diff --git a/docs/public/playground/assets/index-BYbIYCc0.js b/docs/public/playground/assets/index-Bl6wtlTo.js similarity index 64% rename from docs/public/playground/assets/index-BYbIYCc0.js rename to docs/public/playground/assets/index-Bl6wtlTo.js index af59445..d9a5d42 100644 --- a/docs/public/playground/assets/index-BYbIYCc0.js +++ b/docs/public/playground/assets/index-Bl6wtlTo.js @@ -107,7 +107,7 @@ fn bitonic_sort_local( data[global_idx] = shared_data[local_idx]; } } -`,g=[1024,10240,102400,1048576],_=1e4,v=class e{device;bufferManager;localPipeline=null;globalPipeline=null;bindGroupLayout=null;initialized=!1;preallocatedBuffer=null;_preallocatedSize=0;constructor(e){this.device=e.getDevice(),this.bufferManager=new f(this.device)}get preallocatedSize(){return this._preallocatedSize}preallocate(t){this.clearPreallocation();let n=e.nextPowerOf2(t);this.preallocatedBuffer=this.device.createBuffer({label:`preallocated-bitonic-data`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),this._preallocatedSize=t}clearPreallocation(){this.preallocatedBuffer&&(this.preallocatedBuffer.destroy(),this.preallocatedBuffer=null,this._preallocatedSize=0)}static nextPowerOf2(e){if(e<=0)return 1;if(!(e&e-1))return e;let t=1;for(;t0&&(e&e-1)==0}async initializePipelines(){if(this.initialized)return;let e=this.device.createShaderModule({label:`bitonic-sort-shader`,code:h}),t=(await e.getCompilationInfo()).messages.filter(e=>e.type===`error`);if(t.length>0)throw new a(`Bitonic shader compilation failed: ${t.map(e=>e.message).join(`, `)}`);this.bindGroupLayout=this.device.createBindGroupLayout({label:`bitonic-bind-group-layout`,entries:[{binding:0,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]});let n=this.device.createPipelineLayout({label:`bitonic-pipeline-layout`,bindGroupLayouts:[this.bindGroupLayout]});this.localPipeline=this.device.createComputePipeline({label:`bitonic-local-pipeline`,layout:n,compute:{module:e,entryPoint:`bitonic_sort_local`}}),this.globalPipeline=this.device.createComputePipeline({label:`bitonic-global-pipeline`,layout:n,compute:{module:e,entryPoint:`bitonic_sort_global`}}),this.initialized=!0}async sort(t,n){let r=performance.now();await this.initializePipelines();let i=t.length;if(i<=1)return{sortedData:new Uint32Array(t),gpuTimeMs:0,totalTimeMs:performance.now()-r};let o=e.nextPowerOf2(i),s=new Uint32Array(o);s.set(t);for(let e=i;e=i,u=new p,d,f,h;try{l?(this.device.queue.writeBuffer(c,0,s.buffer,s.byteOffset,s.byteLength),d=c):d=u.track(this.bufferManager.createStorageBuffer(s,`sort-data`),e=>this.bufferManager.releaseBuffer(e));let t=u.track(this.bufferManager.createUniformBuffer(16,`sort-uniforms`),e=>this.bufferManager.releaseBuffer(e)),n=this.bindGroupLayout;if(!n)throw new a(`Shader pipelines not initialized`);let r=this.device.createBindGroup({label:`bitonic-bind-group`,layout:n,entries:[{binding:0,resource:{buffer:d}},{binding:1,resource:{buffer:t}}]}),p=performance.now();if(!e.isPowerOf2(o))throw Error(`Invalid paddedSize: ${o} is not a power of 2`);let m=Math.ceil(o/256),g=Math.trunc(Math.log2(o));{let e=this.localPipeline;if(!e)throw new a(`Local pipeline not initialized`);let n=new Uint32Array([0,0,o,0]);this.device.queue.writeBuffer(t,0,n);let i=this.device.createCommandEncoder(),s=i.beginComputePass();s.setPipeline(e),s.setBindGroup(0,r),s.dispatchWorkgroups(m),s.end(),this.device.queue.submit([i.finish()])}let _=Math.trunc(Math.log2(256)),v=this.globalPipeline;if(!v)throw new a(`Global pipeline not initialized`);for(let e=_;e=0;n--){let i=new Uint32Array([e,n,o,0]);this.device.queue.writeBuffer(t,0,i);let a=this.device.createCommandEncoder(),s=a.beginComputePass();s.setPipeline(v),s.setBindGroup(0,r),s.dispatchWorkgroups(m),s.end(),this.device.queue.submit([a.finish()])}await this.device.queue.onSubmittedWorkDone();let y=performance.now();f=(await this.bufferManager.readBuffer(d,o*4)).slice(0,i),h=y-p}finally{u.releaseAll()}if(!f)throw Error(`Bitonic sort completed without producing output`);if(h===void 0)throw Error(`Bitonic sort completed without timing information`);let g=performance.now();if(n?.validate){let e=m.validate(t,f);if(!e.isValid)throw Error(`Sort validation failed: ${e.errors.join(`, `)}`)}return{sortedData:f,gpuTimeMs:h,totalTimeMs:g-r}}destroy(){this.clearPreallocation(),this.bufferManager.releaseAll(),this.localPipeline=null,this.globalPipeline=null,this.bindGroupLayout=null,this.initialized=!1}},y=`// Blelloch Scan WGSL Compute Shaders +`,g=[1024,10240,102400,1048576],_=1e4,v=class e{device;bufferManager;localPipeline=null;globalPipeline=null;bindGroupLayout=null;initialized=!1;preallocatedBuffer=null;_preallocatedSize=0;constructor(e){this.device=e.getDevice(),this.bufferManager=new f(this.device)}get preallocatedSize(){return this._preallocatedSize}preallocate(t){this.clearPreallocation();let n=e.nextPowerOf2(t);this.preallocatedBuffer=this.device.createBuffer({label:`preallocated-bitonic-data`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),this._preallocatedSize=t}clearPreallocation(){this.preallocatedBuffer&&(this.preallocatedBuffer.destroy(),this.preallocatedBuffer=null,this._preallocatedSize=0)}static nextPowerOf2(e){if(e<=0)return 1;if(!(e&e-1))return e;let t=1;for(;t0&&(e&e-1)==0}async initializePipelines(){if(this.initialized)return;let e=this.device.createShaderModule({label:`bitonic-sort-shader`,code:h}),t=(await e.getCompilationInfo()).messages.filter(e=>e.type===`error`);if(t.length>0)throw new a(`Bitonic shader compilation failed: ${t.map(e=>e.message).join(`, `)}`);this.bindGroupLayout=this.device.createBindGroupLayout({label:`bitonic-bind-group-layout`,entries:[{binding:0,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]});let n=this.device.createPipelineLayout({label:`bitonic-pipeline-layout`,bindGroupLayouts:[this.bindGroupLayout]});this.localPipeline=this.device.createComputePipeline({label:`bitonic-local-pipeline`,layout:n,compute:{module:e,entryPoint:`bitonic_sort_local`}}),this.globalPipeline=this.device.createComputePipeline({label:`bitonic-global-pipeline`,layout:n,compute:{module:e,entryPoint:`bitonic_sort_global`}}),this.initialized=!0}async sort(t,n){let r=performance.now();await this.initializePipelines();let i=t.length;if(i<=1)return{sortedData:new Uint32Array(t),gpuTimeMs:0,totalTimeMs:performance.now()-r};let o=e.nextPowerOf2(i),s=new Uint32Array(o);s.set(t);for(let e=i;e=i,u=new p,d,h,g;try{l?(this.device.queue.writeBuffer(c,0,s.buffer,s.byteOffset,s.byteLength),d=c):d=u.track(this.bufferManager.createStorageBuffer(s,`sort-data`),e=>this.bufferManager.releaseBuffer(e));let t=u.track(this.bufferManager.createUniformBuffer(16,`sort-uniforms`),e=>this.bufferManager.releaseBuffer(e)),n=this.bindGroupLayout;if(!n)throw new a(`Shader pipelines not initialized`);let r=this.device.createBindGroup({label:`bitonic-bind-group`,layout:n,entries:[{binding:0,resource:{buffer:d}},{binding:1,resource:{buffer:t}}]}),p=performance.now();if(!e.isPowerOf2(o))throw Error(`Invalid paddedSize: ${o} is not a power of 2`);let m=Math.ceil(o/256),_=Math.trunc(Math.log2(o)),v=Math.trunc(Math.log2(256)),y=this.localPipeline,b=this.globalPipeline;if(!y||!b)throw new a(`Sort pipelines not initialized`);let x=[{stage:0,passNum:0,isLocal:!0}];for(let e=v;e<_;e++)for(let t=e;t>=0;t--)x.push({stage:e,passNum:t,isLocal:!1});let S=new Uint32Array(x.length*4);for(let e=0;e, @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) workgroup_id: vec3 ) { let tid = local_id.x; - let gid = global_id.x; let block_id = workgroup_id.x; let n = scan_uniforms.data_size; // Get the prefix for this block (sum of all previous blocks) let block_prefix = block_sums[block_id]; - // Add block prefix to each element in this block - let idx = gid; - if (idx < n) { - scan_output[idx] = scan_output[idx] + block_prefix; + // Add block prefix to each element in this block (2 elements per thread) + let block_start = block_id * (SCAN_WORKGROUP_SIZE * 2u); + let idx0 = block_start + tid; + let idx1 = block_start + tid + SCAN_WORKGROUP_SIZE; + + if (idx0 < n) { + scan_output[idx0] = scan_output[idx0] + block_prefix; + } + if (idx1 < n) { + scan_output[idx1] = scan_output[idx1] + block_prefix; } -}`,b=256,x=b*2,S=class{device;bufferManager;blellochScanPipeline=null;scanBlockSumsPipeline=null;addBlockPrefixesPipeline=null;scanBindGroupLayout=null;initialized=!1;constructor(e){this.device=e.getDevice(),this.bufferManager=new f(this.device)}async initialize(){if(this.initialized)return;let e=this.device.createShaderModule({label:`scan-shader`,code:y}),t=(await e.getCompilationInfo()).messages.filter(e=>e.type===`error`);if(t.length>0)throw new a(`Scan shader compilation failed: ${t.map(e=>e.message).join(`, `)}`);this.scanBindGroupLayout=this.device.createBindGroupLayout({label:`scan-bind-group-layout`,entries:[{binding:0,visibility:GPUShaderStage.COMPUTE,buffer:{type:`read-only-storage`}},{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:2,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:3,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]});let n=this.device.createPipelineLayout({label:`scan-pipeline-layout`,bindGroupLayouts:[this.scanBindGroupLayout]});this.blellochScanPipeline=this.device.createComputePipeline({label:`blelloch-scan-pipeline`,layout:n,compute:{module:e,entryPoint:`blelloch_scan`}}),this.scanBlockSumsPipeline=this.device.createComputePipeline({label:`scan-block-sums-pipeline`,layout:n,compute:{module:e,entryPoint:`scan_block_sums`}}),this.addBlockPrefixesPipeline=this.device.createComputePipeline({label:`add-block-prefixes-pipeline`,layout:n,compute:{module:e,entryPoint:`add_block_prefixes`}}),this.initialized=!0}async computeExclusivePrefixSum(e){if(!this.initialized)throw new a(`ScanModule not initialized. Call initialize() first.`);let t=e.length;if(t===0)return new Uint32Array;if(t===1)return new Uint32Array([0]);let n=Math.ceil(t/x),r=new p;try{let i=r.track(this.bufferManager.createStorageBuffer(e,`scan-input`),e=>this.bufferManager.releaseBuffer(e)),a=r.track(this.device.createBuffer({label:`scan-output`,size:f.alignSize(t*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),o=r.track(this.device.createBuffer({label:`scan-block-sums`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),s=r.track(this.bufferManager.createUniformBuffer(16,`scan-uniforms`),e=>this.bufferManager.releaseBuffer(e));return this.computePrefixSumGPU(i,a,o,s,t),await this.device.queue.onSubmittedWorkDone(),await this.bufferManager.readBuffer(a,t*4)}finally{r.releaseAll()}}computePrefixSumGPU(e,t,n,r,i){let o=this.scanBindGroupLayout,s=this.blellochScanPipeline,c=this.scanBlockSumsPipeline,l=this.addBlockPrefixesPipeline;if(!o||!s||!c||!l)throw new a(`Scan pipelines not initialized`);let u=Math.ceil(i/x),d=new Uint32Array([i,u,0,0]);this.device.queue.writeBuffer(r,0,d);let f=this.device.createCommandEncoder();{let i=this.device.createBindGroup({label:`blelloch-scan-bind-group`,layout:o,entries:[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:n}},{binding:3,resource:{buffer:r}}]}),a=f.beginComputePass();a.setPipeline(s),a.setBindGroup(0,i),a.dispatchWorkgroups(u),a.end()}if(u>1){let i=this.device.createBindGroup({label:`scan-block-sums-bind-group`,layout:o,entries:[{binding:0,resource:{buffer:n}},{binding:1,resource:{buffer:n}},{binding:2,resource:{buffer:n}},{binding:3,resource:{buffer:r}}]}),a=f.beginComputePass();a.setPipeline(c),a.setBindGroup(0,i),a.dispatchWorkgroups(1),a.end();{let i=this.device.createBindGroup({label:`add-block-prefixes-bind-group`,layout:o,entries:[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:n}},{binding:3,resource:{buffer:r}}]}),a=f.beginComputePass();a.setPipeline(l),a.setBindGroup(0,i),a.dispatchWorkgroups(u),a.end()}}this.device.queue.submit([f.finish()])}static getConstants(){return{scanWorkgroupSize:b,elementsPerScanBlock:x}}destroy(){this.bufferManager.releaseAll(),this.blellochScanPipeline=null,this.scanBlockSumsPipeline=null,this.addBlockPrefixesPipeline=null,this.scanBindGroupLayout=null,this.initialized=!1}},C=`// Radix Sort WGSL Compute Shaders +}`,b=256,x=b*2,S=x,C=class{device;bufferManager;blellochScanPipeline=null;scanBlockSumsPipeline=null;addBlockPrefixesPipeline=null;scanLayout=null;blockSumsScanLayout=null;addPrefixesLayout=null;initialized=!1;constructor(e){this.device=e.getDevice(),this.bufferManager=new f(this.device)}async initialize(){if(this.initialized)return;let e=this.device.createShaderModule({label:`scan-shader`,code:y}),t=(await e.getCompilationInfo()).messages.filter(e=>e.type===`error`);if(t.length>0)throw new a(`Scan shader compilation failed: ${t.map(e=>e.message).join(`, `)}`);this.scanLayout=this.device.createBindGroupLayout({label:`scan-layout`,entries:[{binding:0,visibility:GPUShaderStage.COMPUTE,buffer:{type:`read-only-storage`}},{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:2,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:3,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]}),this.blockSumsScanLayout=this.device.createBindGroupLayout({label:`block-sums-scan-layout`,entries:[{binding:2,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:3,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]}),this.addPrefixesLayout=this.device.createBindGroupLayout({label:`add-prefixes-layout`,entries:[{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:2,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:3,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]}),this.blellochScanPipeline=this.device.createComputePipeline({label:`blelloch-scan-pipeline`,layout:this.device.createPipelineLayout({bindGroupLayouts:[this.scanLayout]}),compute:{module:e,entryPoint:`blelloch_scan`}}),this.scanBlockSumsPipeline=this.device.createComputePipeline({label:`scan-block-sums-pipeline`,layout:this.device.createPipelineLayout({bindGroupLayouts:[this.blockSumsScanLayout]}),compute:{module:e,entryPoint:`scan_block_sums`}}),this.addBlockPrefixesPipeline=this.device.createComputePipeline({label:`add-block-prefixes-pipeline`,layout:this.device.createPipelineLayout({bindGroupLayouts:[this.addPrefixesLayout]}),compute:{module:e,entryPoint:`add_block_prefixes`}}),this.initialized=!0}async computeExclusivePrefixSum(e){if(!this.initialized)throw new a(`ScanModule not initialized. Call initialize() first.`);let t=e.length;if(t===0)return new Uint32Array;if(t===1)return new Uint32Array([0]);let n=Math.ceil(t/x),r=new p;try{let i=r.track(this.bufferManager.createStorageBuffer(e,`scan-input`),e=>this.bufferManager.releaseBuffer(e)),a=r.track(this.device.createBuffer({label:`scan-output`,size:f.alignSize(t*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),o=r.track(this.device.createBuffer({label:`scan-block-sums`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),s=r.track(this.bufferManager.createUniformBuffer(16,`scan-uniforms`),e=>this.bufferManager.releaseBuffer(e));return this.computePrefixSumGPU(i,a,o,s,t),await this.device.queue.onSubmittedWorkDone(),await this.bufferManager.readBuffer(a,t*4)}finally{r.releaseAll()}}computePrefixSumGPU(e,t,n,r,i){let o=this.scanLayout,s=this.blellochScanPipeline,c=this.addBlockPrefixesPipeline,l=this.addPrefixesLayout;if(!o||!s||!c||!l)throw new a(`Scan pipelines not initialized`);let u=Math.ceil(i/x);this.writeUniform(r,i,u);{let i=this.device.createBindGroup({label:`blelloch-scan-bind-group`,layout:o,entries:[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:n}},{binding:3,resource:{buffer:r}}]}),a=this.device.createCommandEncoder(),c=a.beginComputePass();c.setPipeline(s),c.setBindGroup(0,i),c.dispatchWorkgroups(u),c.end(),this.device.queue.submit([a.finish()])}if(u>1){this.scanBlockSumsRecursive(n,u,r),this.writeUniform(r,i,u);{let e=this.device.createBindGroup({label:`add-block-prefixes-bind-group`,layout:l,entries:[{binding:1,resource:{buffer:t}},{binding:2,resource:{buffer:n}},{binding:3,resource:{buffer:r}}]}),i=this.device.createCommandEncoder(),a=i.beginComputePass();a.setPipeline(c),a.setBindGroup(0,e),a.dispatchWorkgroups(u),a.end(),this.device.queue.submit([i.finish()])}}}scanBlockSumsRecursive(e,t,n){let r=this.scanBlockSumsPipeline,i=this.scanLayout,o=this.addBlockPrefixesPipeline,s=this.blockSumsScanLayout,c=this.addPrefixesLayout,l=this.blellochScanPipeline;if(!r||!i||!o||!s||!c||!l)throw new a(`Scan pipelines not initialized`);if(t<=S){this.writeUniform(n,t,t);let i=this.device.createBindGroup({label:`scan-block-sums-bind-group`,layout:s,entries:[{binding:2,resource:{buffer:e}},{binding:3,resource:{buffer:n}}]}),a=this.device.createCommandEncoder(),o=a.beginComputePass();o.setPipeline(r),o.setBindGroup(0,i),o.dispatchWorkgroups(1),o.end(),this.device.queue.submit([a.finish()]);return}let u=Math.ceil(t/x),d=this.device.createBuffer({label:`scan-recursive-temp-output`,size:f.alignSize(t*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),p=this.device.createBuffer({label:`scan-recursive-temp-block-sums`,size:f.alignSize(u*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST});this.writeUniform(n,t,u);{let t=this.device.createBindGroup({label:`recursive-blelloch-scan-bind-group`,layout:i,entries:[{binding:0,resource:{buffer:e}},{binding:1,resource:{buffer:d}},{binding:2,resource:{buffer:p}},{binding:3,resource:{buffer:n}}]}),r=this.device.createCommandEncoder(),a=r.beginComputePass();a.setPipeline(l),a.setBindGroup(0,t),a.dispatchWorkgroups(u),a.end(),this.device.queue.submit([r.finish()])}this.scanBlockSumsRecursive(p,u,n),this.writeUniform(n,t,u);{let e=this.device.createBindGroup({label:`recursive-add-prefixes-bind-group`,layout:c,entries:[{binding:1,resource:{buffer:d}},{binding:2,resource:{buffer:p}},{binding:3,resource:{buffer:n}}]}),t=this.device.createCommandEncoder(),r=t.beginComputePass();r.setPipeline(o),r.setBindGroup(0,e),r.dispatchWorkgroups(u),r.end(),this.device.queue.submit([t.finish()])}{let n=this.device.createCommandEncoder();n.copyBufferToBuffer(d,0,e,0,f.alignSize(t*4,4)),this.device.queue.submit([n.finish()])}d.destroy(),p.destroy()}writeUniform(e,t,n){let r=new Uint32Array([t,n,0,0]);this.device.queue.writeBuffer(e,0,r)}static getConstants(){return{scanWorkgroupSize:b,elementsPerScanBlock:x}}destroy(){this.bufferManager.releaseAll(),this.blellochScanPipeline=null,this.scanBlockSumsPipeline=null,this.addBlockPrefixesPipeline=null,this.scanLayout=null,this.blockSumsScanLayout=null,this.addPrefixesLayout=null,this.initialized=!1}},w=`// Radix Sort WGSL Compute Shaders // Implements 4-bit radix sort with histogram and scatter // Prefix sum computation is handled by scan.wgsl // @@ -444,11 +449,11 @@ fn scatter( output_data[global_offset] = value; } } -`,w=class{device;bufferManager;scanModule;histogramPipeline=null;scatterPipeline=null;bindGroupLayout=null;preallocatedBuffers=null;_preallocatedSize=0;initialized=!1;constructor(e){this.device=e.getDevice(),this.bufferManager=new f(this.device),this.scanModule=new S(e)}get preallocatedSize(){return this._preallocatedSize}preallocate(e){this.clearPreallocation();let{elementsPerScanBlock:t}=S.getConstants(),n=16*Math.ceil(e/256),r=Math.ceil(n/t);this.preallocatedBuffers={input:this.device.createBuffer({label:`preallocated-radix-input`,size:f.alignSize(e*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),output:this.device.createBuffer({label:`preallocated-radix-output`,size:f.alignSize(e*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),histogram:this.device.createBuffer({label:`preallocated-radix-histogram`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),prefixSum:this.device.createBuffer({label:`preallocated-radix-prefix-sum`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),blockSums:this.device.createBuffer({label:`preallocated-radix-block-sums`,size:f.alignSize(r*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})},this._preallocatedSize=e}clearPreallocation(){this.preallocatedBuffers&&(this.preallocatedBuffers.input.destroy(),this.preallocatedBuffers.output.destroy(),this.preallocatedBuffers.histogram.destroy(),this.preallocatedBuffers.prefixSum.destroy(),this.preallocatedBuffers.blockSums.destroy(),this.preallocatedBuffers=null,this._preallocatedSize=0)}async initializePipelines(){if(this.initialized)return;let e=this.device.createShaderModule({label:`radix-sort-shader`,code:C}),t=(await e.getCompilationInfo()).messages.filter(e=>e.type===`error`);if(t.length>0)throw new a(`Radix shader compilation failed: ${t.map(e=>e.message).join(`, `)}`);this.bindGroupLayout=this.device.createBindGroupLayout({label:`radix-bind-group-layout`,entries:[{binding:0,visibility:GPUShaderStage.COMPUTE,buffer:{type:`read-only-storage`}},{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:2,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:3,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:4,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]});let n=this.device.createPipelineLayout({label:`radix-pipeline-layout`,bindGroupLayouts:[this.bindGroupLayout]});this.histogramPipeline=this.device.createComputePipeline({label:`radix-histogram-pipeline`,layout:n,compute:{module:e,entryPoint:`compute_histogram`}}),this.scatterPipeline=this.device.createComputePipeline({label:`radix-scatter-pipeline`,layout:n,compute:{module:e,entryPoint:`scatter`}}),await this.scanModule.initialize(),this.initialized=!0}async sort(e,t){let n=performance.now();await this.initializePipelines();let r=e.length;if(r<=1)return{sortedData:new Uint32Array(e),gpuTimeMs:0,totalTimeMs:performance.now()-n};let{elementsPerScanBlock:i}=S.getConstants(),o=Math.ceil(r/256),s=16*o,c=Math.ceil(s/i),l=this.preallocatedBuffers,u=l!==null&&this._preallocatedSize>=r,d=new p,h,g,_,v,y,b,x;try{let t,n;u?(this.device.queue.writeBuffer(l.input,0,e.buffer,e.byteOffset,e.byteLength),h=l.input,g=l.output,_=l.histogram,v=l.prefixSum,y=l.blockSums,t=d.track(this.bufferManager.createUniformBuffer(16,`radix-uniforms`),e=>this.bufferManager.releaseBuffer(e)),n=d.track(this.bufferManager.createUniformBuffer(16,`scan-uniforms`),e=>this.bufferManager.releaseBuffer(e))):(h=d.track(this.bufferManager.createStorageBuffer(e,`radix-input`),e=>this.bufferManager.releaseBuffer(e)),g=d.track(this.device.createBuffer({label:`radix-output`,size:f.alignSize(r*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),_=d.track(this.device.createBuffer({label:`radix-histogram`,size:f.alignSize(s*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),v=d.track(this.device.createBuffer({label:`radix-prefix-sum`,size:f.alignSize(s*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),y=d.track(this.device.createBuffer({label:`radix-block-sums`,size:f.alignSize(c*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),t=d.track(this.bufferManager.createUniformBuffer(16,`radix-uniforms`),e=>this.bufferManager.releaseBuffer(e)),n=d.track(this.bufferManager.createUniformBuffer(16,`scan-uniforms`),e=>this.bufferManager.releaseBuffer(e)));let i=h,p=g,m=performance.now();for(let e=0;e<8;e++){let c=e*4,l=new Uint32Array(s);this.device.queue.writeBuffer(_,0,l);let u=new Uint32Array([c,r,o,0]);this.device.queue.writeBuffer(t,0,u);let d=this.bindGroupLayout;if(!d)throw new a(`Shader pipelines not initialized`);let f=this.device.createBindGroup({label:`radix-bind-group-pass-${e}`,layout:d,entries:[{binding:0,resource:{buffer:i}},{binding:1,resource:{buffer:p}},{binding:2,resource:{buffer:_}},{binding:3,resource:{buffer:v}},{binding:4,resource:{buffer:t}}]});{let e=this.histogramPipeline;if(!e)throw new a(`Histogram pipeline not initialized`);let t=this.device.createCommandEncoder(),n=t.beginComputePass();n.setPipeline(e),n.setBindGroup(0,f),n.dispatchWorkgroups(o),n.end(),this.device.queue.submit([t.finish()])}this.scanModule.computePrefixSumGPU(_,v,y,n,s);{let e=this.scatterPipeline;if(!e)throw new a(`Scatter pipeline not initialized`);let t=this.device.createCommandEncoder(),n=t.beginComputePass();n.setPipeline(e),n.setBindGroup(0,f),n.dispatchWorkgroups(o),n.end(),this.device.queue.submit([t.finish()])}let m=i;i=p,p=m}await this.device.queue.onSubmittedWorkDone();let S=performance.now();b=await this.bufferManager.readBuffer(i,r*4),x=S-m}finally{d.releaseAll()}if(!b)throw Error(`Radix sort completed without producing output`);if(x===void 0)throw Error(`Radix sort completed without timing information`);let C=performance.now();if(t?.validate){let t=m.validate(e,b);if(!t.isValid)throw Error(`Sort validation failed: ${t.errors.join(`, `)}`)}return{sortedData:b,gpuTimeMs:x,totalTimeMs:C-n}}destroy(){this.clearPreallocation(),this.bufferManager.releaseAll(),this.scanModule.destroy(),this.histogramPipeline=null,this.scatterPipeline=null,this.bindGroupLayout=null,this.initialized=!1}},T=65536/Uint32Array.BYTES_PER_ELEMENT,E=4294967296;function D(e){if(typeof crypto<`u`&&typeof crypto.getRandomValues==`function`){for(let t=0;te+t,0)/e.length}static generateRandomData(e){return O(e)}runNativeSort(e){let t=new Uint32Array(e),n=performance.now();return t.sort(),performance.now()-n}async runSingle(t,n,r=5){let i=[],a=[];for(let o=0;o0?e.calculateAverage(a):void 0,totalTimeMs:o,iterations:r}}async runAll(t=[...g]){let n=[];for(let r of t){let t=await this.runSingle(`js-native`,r);n.push(t);let i=await this.runSingle(`bitonic`,r);i.speedupVsNative=e.calculateSpeedup(t.totalTimeMs,i.totalTimeMs),n.push(i);let a=await this.runSingle(`radix`,r);a.speedupVsNative=e.calculateSpeedup(t.totalTimeMs,a.totalTimeMs),n.push(a)}return n}static formatResults(e){let t=[];t.push(`| Algorithm | Size | Total Time (ms) | GPU Time (ms) | Speedup |`),t.push(`|-----------|------|-----------------|---------------|---------|`);for(let n of e){let e=n.speedupVsNative?n.speedupVsNative.toFixed(2)+`x`:`-`,r=n.gpuTimeMs===void 0?`-`:n.gpuTimeMs.toFixed(2);t.push(`| ${n.algorithm.padEnd(9)} | ${n.arraySize.toString().padStart(7)} | ${n.totalTimeMs.toFixed(2).padStart(15)} | ${r.padStart(13)} | ${e.padStart(7)} |`)}return t.join(` -`)}destroy(){this.bitonicSorter&&=(this.bitonicSorter.destroy(),null),this.radixSorter&&=(this.radixSorter.destroy(),null)}},A=document.getElementById(`unsupported`),j=document.getElementById(`app`),M=document.getElementById(`algorithm`),N=document.getElementById(`arraySize`),P=document.getElementById(`iterations`),F=document.getElementById(`runBtn`),I=document.getElementById(`runAllBtn`),L=document.getElementById(`status`),R=document.getElementById(`statusText`),z=document.getElementById(`progressBar`),B=document.getElementById(`resultsCard`),V=document.getElementById(`resultsBody`),H=null,U=null;async function W(){if(!l.isSupported()){G();return}try{H=new l,await H.initialize({powerPreference:`high-performance`}),U=new k(H),te(),K(`Ready to run benchmarks`,`success`)}catch(e){G(),console.error(`Failed to initialize WebGPU:`,e)}}function G(){A.style.display=`block`,j.style.display=`none`}function K(e,t=`info`){L.classList.add(`visible`),L.classList.remove(`error`,`success`),t===`error`&&L.classList.add(`error`),t===`success`&&L.classList.add(`success`),R.textContent=e}function q(e){z.style.width=`${e}%`}function J(e){F.disabled=!e,I.disabled=!e}function Y(e){return e<1?`${(e*1e3).toFixed(2)} µs`:e<1e3?`${e.toFixed(2)} ms`:`${(e/1e3).toFixed(2)} s`}function X(e){return e>=1e6?`${(e/1e6).toFixed(1)}M`:e>=1e3?`${(e/1e3).toFixed(0)}K`:e.toString()}function Z(e,t){let n=document.createElement(`tr`),r=t&&e.algorithm!==`js-native`?t/e.totalTimeMs:e.speedupVsNative,i=r&&r>1?`fast`:`slow`,a=r?`${r.toFixed(2)}x`:`-`;n.innerHTML=` +`,T=class{device;bufferManager;scanModule;histogramPipeline=null;scatterPipeline=null;bindGroupLayout=null;preallocatedBuffers=null;_preallocatedSize=0;initialized=!1;constructor(e){this.device=e.getDevice(),this.bufferManager=new f(this.device),this.scanModule=new C(e)}get preallocatedSize(){return this._preallocatedSize}preallocate(e){this.clearPreallocation();let{elementsPerScanBlock:t}=C.getConstants(),n=16*Math.ceil(e/256),r=Math.ceil(n/t);this.preallocatedBuffers={input:this.device.createBuffer({label:`preallocated-radix-input`,size:f.alignSize(e*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),output:this.device.createBuffer({label:`preallocated-radix-output`,size:f.alignSize(e*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),histogram:this.device.createBuffer({label:`preallocated-radix-histogram`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),prefixSum:this.device.createBuffer({label:`preallocated-radix-prefix-sum`,size:f.alignSize(n*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST}),blockSums:this.device.createBuffer({label:`preallocated-radix-block-sums`,size:f.alignSize(r*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})},this._preallocatedSize=e}clearPreallocation(){this.preallocatedBuffers&&(this.preallocatedBuffers.input.destroy(),this.preallocatedBuffers.output.destroy(),this.preallocatedBuffers.histogram.destroy(),this.preallocatedBuffers.prefixSum.destroy(),this.preallocatedBuffers.blockSums.destroy(),this.preallocatedBuffers=null,this._preallocatedSize=0)}async initializePipelines(){if(this.initialized)return;let e=this.device.createShaderModule({label:`radix-sort-shader`,code:w}),t=(await e.getCompilationInfo()).messages.filter(e=>e.type===`error`);if(t.length>0)throw new a(`Radix shader compilation failed: ${t.map(e=>e.message).join(`, `)}`);this.bindGroupLayout=this.device.createBindGroupLayout({label:`radix-bind-group-layout`,entries:[{binding:0,visibility:GPUShaderStage.COMPUTE,buffer:{type:`read-only-storage`}},{binding:1,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:2,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:3,visibility:GPUShaderStage.COMPUTE,buffer:{type:`storage`}},{binding:4,visibility:GPUShaderStage.COMPUTE,buffer:{type:`uniform`}}]});let n=this.device.createPipelineLayout({label:`radix-pipeline-layout`,bindGroupLayouts:[this.bindGroupLayout]});this.histogramPipeline=this.device.createComputePipeline({label:`radix-histogram-pipeline`,layout:n,compute:{module:e,entryPoint:`compute_histogram`}}),this.scatterPipeline=this.device.createComputePipeline({label:`radix-scatter-pipeline`,layout:n,compute:{module:e,entryPoint:`scatter`}}),await this.scanModule.initialize(),this.initialized=!0}async sort(e,t){let n=performance.now();await this.initializePipelines();let r=e.length;if(r<=1)return{sortedData:new Uint32Array(e),gpuTimeMs:0,totalTimeMs:performance.now()-n};let{elementsPerScanBlock:i}=C.getConstants(),o=Math.ceil(r/256),s=16*o,c=Math.ceil(s/i),l=this.preallocatedBuffers,u=l!==null&&this._preallocatedSize>=r,d=new p,h,g,_,v,y,b,x;try{let t,n;u?(this.device.queue.writeBuffer(l.input,0,e.buffer,e.byteOffset,e.byteLength),h=l.input,g=l.output,_=l.histogram,v=l.prefixSum,y=l.blockSums,t=d.track(this.bufferManager.createUniformBuffer(16,`radix-uniforms`),e=>this.bufferManager.releaseBuffer(e)),n=d.track(this.bufferManager.createUniformBuffer(16,`scan-uniforms`),e=>this.bufferManager.releaseBuffer(e))):(h=d.track(this.bufferManager.createStorageBuffer(e,`radix-input`),e=>this.bufferManager.releaseBuffer(e)),g=d.track(this.device.createBuffer({label:`radix-output`,size:f.alignSize(r*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),_=d.track(this.device.createBuffer({label:`radix-histogram`,size:f.alignSize(s*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),v=d.track(this.device.createBuffer({label:`radix-prefix-sum`,size:f.alignSize(s*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),y=d.track(this.device.createBuffer({label:`radix-block-sums`,size:f.alignSize(c*4,4),usage:GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST})),t=d.track(this.bufferManager.createUniformBuffer(16,`radix-uniforms`),e=>this.bufferManager.releaseBuffer(e)),n=d.track(this.bufferManager.createUniformBuffer(16,`scan-uniforms`),e=>this.bufferManager.releaseBuffer(e)));let i=h,p=g,m=performance.now(),S=new Uint32Array(s);for(let e=0;e<8;e++){let c=e*4;this.device.queue.writeBuffer(_,0,S);let l=new Uint32Array([c,r,o,0]);this.device.queue.writeBuffer(t,0,l);let u=this.bindGroupLayout;if(!u)throw new a(`Shader pipelines not initialized`);let d=this.device.createBindGroup({label:`radix-bind-group-pass-${e}`,layout:u,entries:[{binding:0,resource:{buffer:i}},{binding:1,resource:{buffer:p}},{binding:2,resource:{buffer:_}},{binding:3,resource:{buffer:v}},{binding:4,resource:{buffer:t}}]});{let e=this.histogramPipeline;if(!e)throw new a(`Histogram pipeline not initialized`);let t=this.device.createCommandEncoder(),n=t.beginComputePass();n.setPipeline(e),n.setBindGroup(0,d),n.dispatchWorkgroups(o),n.end(),this.device.queue.submit([t.finish()])}this.scanModule.computePrefixSumGPU(_,v,y,n,s);{let e=this.scatterPipeline;if(!e)throw new a(`Scatter pipeline not initialized`);let t=this.device.createCommandEncoder(),n=t.beginComputePass();n.setPipeline(e),n.setBindGroup(0,d),n.dispatchWorkgroups(o),n.end(),this.device.queue.submit([t.finish()])}let f=i;i=p,p=f}await this.device.queue.onSubmittedWorkDone();let C=performance.now();b=await this.bufferManager.readBuffer(i,r*4),x=C-m}finally{d.releaseAll()}if(!b)throw Error(`Radix sort completed without producing output`);if(x===void 0)throw Error(`Radix sort completed without timing information`);let S=performance.now();if(t?.validate){let t=m.validate(e,b);if(!t.isValid)throw Error(`Sort validation failed: ${t.errors.join(`, `)}`)}return{sortedData:b,gpuTimeMs:x,totalTimeMs:S-n}}destroy(){this.clearPreallocation(),this.bufferManager.releaseAll(),this.scanModule.destroy(),this.histogramPipeline=null,this.scatterPipeline=null,this.bindGroupLayout=null,this.initialized=!1}},E=65536/Uint32Array.BYTES_PER_ELEMENT,D=4294967296;function O(e){if(typeof crypto<`u`&&typeof crypto.getRandomValues==`function`){for(let t=0;te+t,0)/e.length}static generateRandomData(e){return k(e)}runNativeSort(e){let t=new Uint32Array(e),n=performance.now();return t.sort(),performance.now()-n}async runSingle(t,n,r=5){let i=[],a=[],o=null;t===`bitonic`?(this.bitonicSorter||=new v(this.context),o=this.bitonicSorter,this.bitonicSorter.preallocate(n)):t===`radix`&&(this.radixSorter||=new T(this.context),o=this.radixSorter,this.radixSorter.preallocate(n));for(let s=0;s0?e.calculateAverage(a):void 0,totalTimeMs:s,iterations:r}}async runAll(t=[...g]){let n=[];for(let r of t){let t=await this.runSingle(`js-native`,r);n.push(t);let i=await this.runSingle(`bitonic`,r);i.speedupVsNative=e.calculateSpeedup(t.totalTimeMs,i.totalTimeMs),n.push(i);let a=await this.runSingle(`radix`,r);a.speedupVsNative=e.calculateSpeedup(t.totalTimeMs,a.totalTimeMs),n.push(a)}return n}static formatResults(e){let t=[];t.push(`| Algorithm | Size | Total Time (ms) | GPU Time (ms) | Speedup |`),t.push(`|-----------|------|-----------------|---------------|---------|`);for(let n of e){let e=n.speedupVsNative?n.speedupVsNative.toFixed(2)+`x`:`-`,r=n.gpuTimeMs===void 0?`-`:n.gpuTimeMs.toFixed(2);t.push(`| ${n.algorithm.padEnd(9)} | ${n.arraySize.toString().padStart(7)} | ${n.totalTimeMs.toFixed(2).padStart(15)} | ${r.padStart(13)} | ${e.padStart(7)} |`)}return t.join(` +`)}destroy(){this.bitonicSorter&&=(this.bitonicSorter.destroy(),null),this.radixSorter&&=(this.radixSorter.destroy(),null)}},j=document.getElementById(`unsupported`),M=document.getElementById(`app`),N=document.getElementById(`algorithm`),P=document.getElementById(`arraySize`),F=document.getElementById(`iterations`),I=document.getElementById(`runBtn`),L=document.getElementById(`runAllBtn`),R=document.getElementById(`status`),z=document.getElementById(`statusText`),B=document.getElementById(`progressBar`),V=document.getElementById(`resultsCard`),H=document.getElementById(`resultsBody`),U=null,W=null;async function G(){if(!l.isSupported()){K();return}try{U=new l,await U.initialize({powerPreference:`high-performance`}),W=new A(U),ne(),q(`Ready to run benchmarks`,`success`)}catch(e){K(),console.error(`Failed to initialize WebGPU:`,e)}}function K(){j.style.display=`block`,M.style.display=`none`}function q(e,t=`info`){R.classList.add(`visible`),R.classList.remove(`error`,`success`),t===`error`&&R.classList.add(`error`),t===`success`&&R.classList.add(`success`),z.textContent=e}function J(e){B.style.width=`${e}%`}function Y(e){I.disabled=!e,L.disabled=!e}function X(e){return e<1?`${(e*1e3).toFixed(2)} µs`:e<1e3?`${e.toFixed(2)} ms`:`${(e/1e3).toFixed(2)} s`}function Z(e){return e>=1e6?`${(e/1e6).toFixed(1)}M`:e>=1e3?`${(e/1e3).toFixed(0)}K`:e.toString()}function Q(e,t){let n=document.createElement(`tr`),r=t&&e.algorithm!==`js-native`?t/e.totalTimeMs:e.speedupVsNative,i=r&&r>1?`fast`:`slow`,a=r?`${r.toFixed(2)}x`:`-`;n.innerHTML=` ${e.algorithm} - ${X(e.arraySize)} - ${Y(e.totalTimeMs)} - ${e.gpuTimeMs===void 0?`-`:Y(e.gpuTimeMs)} + ${Z(e.arraySize)} + ${X(e.totalTimeMs)} + ${e.gpuTimeMs===void 0?`-`:X(e.gpuTimeMs)} ${a} - `,V.appendChild(n),B.style.display=`block`}function Q(){V.innerHTML=``,B.style.display=`none`}async function $(){if(!U||!H)return;let e=M.value,t=parseInt(N.value),n=parseInt(P.value);J(!1),Q(),q(0);try{K(`Running JavaScript native sort (${X(t)} elements)...`),q(10);let r=await U.runSingle(`js-native`,t,n);if(Z(r),q(30),e===`bitonic`||e===`both`){K(`Running Bitonic Sort (${X(t)} elements)...`),Z(await U.runSingle(`bitonic`,t,n),r.totalTimeMs),q(e===`both`?60:90);let i=k.generateRandomData(Math.min(t,_)),a=new v(H),o=await a.sort(i),s=m.validate(i,o.sortedData);a.destroy(),s.isValid||console.warn(`Bitonic sort validation failed:`,s.errors)}if(e===`radix`||e===`both`){K(`Running Radix Sort (${X(t)} elements)...`),Z(await U.runSingle(`radix`,t,n),r.totalTimeMs),q(90);let e=k.generateRandomData(Math.min(t,_)),i=new w(H),a=await i.sort(e),o=m.validate(e,a.sortedData);i.destroy(),o.isValid||console.warn(`Radix sort validation failed:`,o.errors)}q(100),K(`Benchmark complete!`,`success`)}catch(e){K(`Error: ${e instanceof Error?e.message:String(e)}`,`error`),console.error(e)}finally{J(!0)}}async function ee(){if(!U)return;let e=[...g],t=parseInt(P.value);J(!1),Q(),q(0);try{let n=e.length*3,r=0;for(let i of e){K(`Running JS sort (${X(i)} elements)...`);let e=await U.runSingle(`js-native`,i,t);Z(e),r++,q(r/n*100),K(`Running Bitonic Sort (${X(i)} elements)...`),Z(await U.runSingle(`bitonic`,i,t),e.totalTimeMs),r++,q(r/n*100),K(`Running Radix Sort (${X(i)} elements)...`),Z(await U.runSingle(`radix`,i,t),e.totalTimeMs),r++,q(r/n*100)}K(`Full benchmark suite complete!`,`success`)}catch(e){K(`Error: ${e instanceof Error?e.message:String(e)}`,`error`),console.error(e)}finally{J(!0)}}function te(){F.addEventListener(`click`,$),I.addEventListener(`click`,ee)}W(); \ No newline at end of file + `,H.appendChild(n),V.style.display=`block`}function $(){H.innerHTML=``,V.style.display=`none`}async function ee(){if(!W||!U)return;let e=N.value,t=parseInt(P.value),n=parseInt(F.value);Y(!1),$(),J(0);try{q(`Running JavaScript native sort (${Z(t)} elements)...`),J(10);let r=await W.runSingle(`js-native`,t,n);if(Q(r),J(30),e===`bitonic`||e===`both`){q(`Running Bitonic Sort (${Z(t)} elements)...`),Q(await W.runSingle(`bitonic`,t,n),r.totalTimeMs),J(e===`both`?60:90);let i=A.generateRandomData(Math.min(t,_)),a=new v(U),o=await a.sort(i),s=m.validate(i,o.sortedData);a.destroy(),s.isValid||console.warn(`Bitonic sort validation failed:`,s.errors)}if(e===`radix`||e===`both`){q(`Running Radix Sort (${Z(t)} elements)...`),Q(await W.runSingle(`radix`,t,n),r.totalTimeMs),J(90);let e=A.generateRandomData(Math.min(t,_)),i=new T(U),a=await i.sort(e),o=m.validate(e,a.sortedData);i.destroy(),o.isValid||console.warn(`Radix sort validation failed:`,o.errors)}J(100),q(`Benchmark complete!`,`success`)}catch(e){q(`Error: ${e instanceof Error?e.message:String(e)}`,`error`),console.error(e)}finally{Y(!0)}}async function te(){if(!W)return;let e=[...g],t=parseInt(F.value);Y(!1),$(),J(0);try{let n=e.length*3,r=0;for(let i of e){q(`Running JS sort (${Z(i)} elements)...`);let e=await W.runSingle(`js-native`,i,t);Q(e),r++,J(r/n*100),q(`Running Bitonic Sort (${Z(i)} elements)...`),Q(await W.runSingle(`bitonic`,i,t),e.totalTimeMs),r++,J(r/n*100),q(`Running Radix Sort (${Z(i)} elements)...`),Q(await W.runSingle(`radix`,i,t),e.totalTimeMs),r++,J(r/n*100)}q(`Full benchmark suite complete!`,`success`)}catch(e){q(`Error: ${e instanceof Error?e.message:String(e)}`,`error`),console.error(e)}finally{Y(!0)}}function ne(){I.addEventListener(`click`,ee),L.addEventListener(`click`,te)}G(); \ No newline at end of file diff --git a/docs/public/playground/index.html b/docs/public/playground/index.html index 13387c8..6dcc65a 100644 --- a/docs/public/playground/index.html +++ b/docs/public/playground/index.html @@ -237,7 +237,7 @@ } } - +
diff --git a/src/benchmark/Benchmark.ts b/src/benchmark/Benchmark.ts index e7102b6..f44671b 100644 --- a/src/benchmark/Benchmark.ts +++ b/src/benchmark/Benchmark.ts @@ -62,24 +62,31 @@ export class Benchmark { const times: number[] = []; const gpuTimes: number[] = []; + // Preallocate GPU buffers for the target size so iterations measure + // steady-state sort performance (buffer reuse) rather than allocation. + let sorter: BitonicSorter | RadixSorter | null = null; + if (algorithm === 'bitonic') { + if (!this.bitonicSorter) { + this.bitonicSorter = new BitonicSorter(this.context); + } + sorter = this.bitonicSorter; + this.bitonicSorter.preallocate(size); + } else if (algorithm === 'radix') { + if (!this.radixSorter) { + this.radixSorter = new RadixSorter(this.context); + } + sorter = this.radixSorter; + this.radixSorter.preallocate(size); + } + for (let i = 0; i < iterations; i++) { const data = Benchmark.generateRandomData(size); if (algorithm === 'js-native') { const time = this.runNativeSort(data); times.push(time); - } else if (algorithm === 'bitonic') { - if (!this.bitonicSorter) { - this.bitonicSorter = new BitonicSorter(this.context); - } - const result = await this.bitonicSorter.sort(data); - times.push(result.totalTimeMs); - gpuTimes.push(result.gpuTimeMs); - } else if (algorithm === 'radix') { - if (!this.radixSorter) { - this.radixSorter = new RadixSorter(this.context); - } - const result = await this.radixSorter.sort(data); + } else if (sorter) { + const result = await sorter.sort(data); times.push(result.totalTimeMs); gpuTimes.push(result.gpuTimeMs); } diff --git a/src/shaders/scan.wgsl b/src/shaders/scan.wgsl index 0bb0baf..9de7b4f 100644 --- a/src/shaders/scan.wgsl +++ b/src/shaders/scan.wgsl @@ -216,23 +216,28 @@ fn scan_block_sums( // Add block prefixes to each block's local scan results // This is the third step of two-level scan +// Each thread handles 2 elements (matching blelloch_scan's 512 elements per workgroup) @compute @workgroup_size(SCAN_WORKGROUP_SIZE) fn add_block_prefixes( - @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) workgroup_id: vec3 ) { let tid = local_id.x; - let gid = global_id.x; let block_id = workgroup_id.x; let n = scan_uniforms.data_size; // Get the prefix for this block (sum of all previous blocks) let block_prefix = block_sums[block_id]; - // Add block prefix to each element in this block - let idx = gid; - if (idx < n) { - scan_output[idx] = scan_output[idx] + block_prefix; + // Add block prefix to each element in this block (2 elements per thread) + let block_start = block_id * (SCAN_WORKGROUP_SIZE * 2u); + let idx0 = block_start + tid; + let idx1 = block_start + tid + SCAN_WORKGROUP_SIZE; + + if (idx0 < n) { + scan_output[idx0] = scan_output[idx0] + block_prefix; + } + if (idx1 < n) { + scan_output[idx1] = scan_output[idx1] + block_prefix; } } \ No newline at end of file diff --git a/src/shared/random.ts b/src/shared/random.ts index 362920e..8386e25 100644 --- a/src/shared/random.ts +++ b/src/shared/random.ts @@ -10,9 +10,10 @@ export function fillRandomUint32Array(data: Uint32Array): Uint32Array { if (typeof crypto !== 'undefined' && typeof crypto.getRandomValues === 'function') { for (let offset = 0; offset < data.length; offset += MAX_CRYPTO_FILL_U32) { const chunkLength = Math.min(MAX_CRYPTO_FILL_U32, data.length - offset); - const chunk = new Uint32Array(chunkLength); - crypto.getRandomValues(chunk); - data.set(chunk, offset); + // Fill in-place via subarray view — avoids per-chunk allocation + copy + crypto.getRandomValues( + data.subarray(offset, offset + chunkLength) as Uint32Array + ); } return data; } diff --git a/src/sorting/BitonicSorter.ts b/src/sorting/BitonicSorter.ts index 2826700..44bf732 100644 --- a/src/sorting/BitonicSorter.ts +++ b/src/sorting/BitonicSorter.ts @@ -244,49 +244,61 @@ export class BitonicSorter { const numWorkgroups = Math.ceil(paddedSize / WORKGROUP_SIZE); // Safe integer log2 - paddedSize is guaranteed to be power of 2 const numStages = Math.trunc(Math.log2(paddedSize)); - - // First, do local sort within each workgroup - { - const localPipeline = this.localPipeline; - if (!localPipeline) { - throw new ShaderCompilationError('Local pipeline not initialized'); - } - - const uniformData = new Uint32Array([0, 0, paddedSize, 0]); - this.device.queue.writeBuffer(uniformBuffer, 0, uniformData); - - const commandEncoder = this.device.createCommandEncoder(); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(localPipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(numWorkgroups); - passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); - } - - // Then do global merge stages // Safe integer log2 - WORKGROUP_SIZE is guaranteed to be power of 2 const localStages = Math.trunc(Math.log2(WORKGROUP_SIZE)); + + const localPipeline = this.localPipeline; const globalPipeline = this.globalPipeline; - if (!globalPipeline) { - throw new ShaderCompilationError('Global pipeline not initialized'); + if (!localPipeline || !globalPipeline) { + throw new ShaderCompilationError('Sort pipelines not initialized'); } + // Pre-compute all uniform values (local pass + all global passes) into a + // single buffer, then batch every dispatch into one command encoder with + // copyBufferToBuffer updating the uniform between passes. This eliminates + // per-pass queue submissions (can be 100+ for large arrays). + const passes: Array<{ stage: number; passNum: number; isLocal: boolean }> = [ + { stage: 0, passNum: 0, isLocal: true }, + ]; for (let stage = localStages; stage < numStages; stage++) { for (let passNum = stage; passNum >= 0; passNum--) { - const uniformData = new Uint32Array([stage, passNum, paddedSize, 0]); - this.device.queue.writeBuffer(uniformBuffer, 0, uniformData); - - const commandEncoder = this.device.createCommandEncoder(); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(globalPipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(numWorkgroups); - passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); + passes.push({ stage, passNum, isLocal: false }); } } + const uniformData = new Uint32Array(passes.length * 4); + for (let i = 0; i < passes.length; i++) { + const p = passes[i]; + uniformData[i * 4] = p.stage; + uniformData[i * 4 + 1] = p.passNum; + uniformData[i * 4 + 2] = paddedSize; + uniformData[i * 4 + 3] = 0; + } + + const uniformDataBuffer = bufferScope.track( + this.device.createBuffer({ + label: 'bitonic-uniform-data', + size: BufferManager.alignSize(uniformData.byteLength, 4), + usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, + }) + ); + this.device.queue.writeBuffer(uniformDataBuffer, 0, uniformData); + + // Single command encoder for all passes — compute passes within an + // encoder are ordered and each sees the writes of previous passes. + const commandEncoder = this.device.createCommandEncoder(); + for (let i = 0; i < passes.length; i++) { + // Update uniform for this pass via encoder-level copy (ordered) + commandEncoder.copyBufferToBuffer(uniformDataBuffer, i * 16, uniformBuffer, 0, 16); + + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(passes[i].isLocal ? localPipeline : globalPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(numWorkgroups); + passEncoder.end(); + } + this.device.queue.submit([commandEncoder.finish()]); + // Wait for GPU to finish await this.device.queue.onSubmittedWorkDone(); diff --git a/src/sorting/RadixSorter.ts b/src/sorting/RadixSorter.ts index a3c6403..adb4aee 100644 --- a/src/sorting/RadixSorter.ts +++ b/src/sorting/RadixSorter.ts @@ -278,12 +278,14 @@ export class RadixSorter { const gpuStartTime = performance.now(); + // Reusable zero buffer for histogram clearing (avoids per-pass allocation) + const zeroHistogram = new Uint32Array(histogramSize); + // Perform 8 passes (4 bits each) for (let pass = 0; pass < NUM_PASSES; pass++) { const bitOffset = pass * BITS_PER_PASS; // Clear histogram - const zeroHistogram = new Uint32Array(histogramSize); this.device.queue.writeBuffer(histogramBuffer, 0, zeroHistogram); // Update uniforms diff --git a/src/sorting/scan/ScanModule.ts b/src/sorting/scan/ScanModule.ts index 7b3f247..a05ccaf 100644 --- a/src/sorting/scan/ScanModule.ts +++ b/src/sorting/scan/ScanModule.ts @@ -3,6 +3,12 @@ * * This module provides a dedicated scan interface for computing * exclusive prefix sums on the GPU using the Blelloch algorithm. + * + * Architecture: + * - Three dedicated bind group layouts (one per pipeline) to avoid + * read-only/read-write binding hazards on the same buffer. + * - Recursive multi-level scan for arbitrarily large inputs: block sums + * are scanned recursively until they fit in a single workgroup. */ import { GPUContext } from '../../core/GPUContext'; @@ -15,6 +21,8 @@ import scanShaderCode from '../../shaders/scan.wgsl?raw'; const SCAN_WORKGROUP_SIZE = 256; /** Elements processed per scan workgroup (each thread handles 2 elements) */ const ELEMENTS_PER_SCAN_BLOCK = SCAN_WORKGROUP_SIZE * 2; +/** Maximum block sums a single scan_block_sums workgroup can process */ +const MAX_LEAF_BLOCK_SUMS = ELEMENTS_PER_SCAN_BLOCK; // 512 /** * GPU-based exclusive prefix sum module using Blelloch scan @@ -27,7 +35,11 @@ export class ScanModule { private blellochScanPipeline: GPUComputePipeline | null = null; private scanBlockSumsPipeline: GPUComputePipeline | null = null; private addBlockPrefixesPipeline: GPUComputePipeline | null = null; - private scanBindGroupLayout: GPUBindGroupLayout | null = null; + + // Dedicated bind group layouts per pipeline (avoids binding hazards) + private scanLayout: GPUBindGroupLayout | null = null; // bindings 0,1,2,3 + private blockSumsScanLayout: GPUBindGroupLayout | null = null; // bindings 2,3 + private addPrefixesLayout: GPUBindGroupLayout | null = null; // bindings 1,2,3 private initialized = false; @@ -55,9 +67,9 @@ export class ScanModule { ); } - // Create scan bind group layout - this.scanBindGroupLayout = this.device.createBindGroupLayout({ - label: 'scan-bind-group-layout', + // Layout for blelloch_scan: input(0, read), output(1, rw), blockSums(2, rw), uniforms(3) + this.scanLayout = this.device.createBindGroupLayout({ + label: 'scan-layout', entries: [ { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } }, { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, @@ -66,37 +78,41 @@ export class ScanModule { ], }); - const scanPipelineLayout = this.device.createPipelineLayout({ - label: 'scan-pipeline-layout', - bindGroupLayouts: [this.scanBindGroupLayout], + // Layout for scan_block_sums: blockSums(2, rw), uniforms(3) + this.blockSumsScanLayout = this.device.createBindGroupLayout({ + label: 'block-sums-scan-layout', + entries: [ + { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, + { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } }, + ], + }); + + // Layout for add_block_prefixes: output(1, rw), blockSums(2, rw), uniforms(3) + this.addPrefixesLayout = this.device.createBindGroupLayout({ + label: 'add-prefixes-layout', + entries: [ + { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, + { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, + { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } }, + ], }); - // Create Blelloch scan pipelines this.blellochScanPipeline = this.device.createComputePipeline({ label: 'blelloch-scan-pipeline', - layout: scanPipelineLayout, - compute: { - module: shaderModule, - entryPoint: 'blelloch_scan', - }, + layout: this.device.createPipelineLayout({ bindGroupLayouts: [this.scanLayout] }), + compute: { module: shaderModule, entryPoint: 'blelloch_scan' }, }); this.scanBlockSumsPipeline = this.device.createComputePipeline({ label: 'scan-block-sums-pipeline', - layout: scanPipelineLayout, - compute: { - module: shaderModule, - entryPoint: 'scan_block_sums', - }, + layout: this.device.createPipelineLayout({ bindGroupLayouts: [this.blockSumsScanLayout] }), + compute: { module: shaderModule, entryPoint: 'scan_block_sums' }, }); this.addBlockPrefixesPipeline = this.device.createComputePipeline({ label: 'add-block-prefixes-pipeline', - layout: scanPipelineLayout, - compute: { - module: shaderModule, - entryPoint: 'add_block_prefixes', - }, + layout: this.device.createPipelineLayout({ bindGroupLayouts: [this.addPrefixesLayout] }), + compute: { module: shaderModule, entryPoint: 'add_block_prefixes' }, }); this.initialized = true; @@ -105,9 +121,9 @@ export class ScanModule { /** * Compute exclusive prefix sum on GPU using Blelloch scan * - * Uses a two-level scan for large arrays: - * 1. Local scan within each workgroup - * 2. Scan of block sums + * Uses a recursive multi-level scan for large arrays: + * 1. Local scan within each workgroup (512 elements each) + * 2. Recursive exclusive prefix sum of block sums * 3. Add block prefixes to local results * * @param input - Input array to compute prefix sum for @@ -120,7 +136,6 @@ export class ScanModule { const dataSize = input.length; - // Handle edge cases if (dataSize === 0) { return new Uint32Array(0); } @@ -133,7 +148,6 @@ export class ScanModule { const bufferScope = new BufferScope(); try { - // Create buffers const inputBuffer = bufferScope.track( this.bufferManager.createStorageBuffer(input, 'scan-input'), (buffer) => this.bufferManager.releaseBuffer(buffer) @@ -160,13 +174,10 @@ export class ScanModule { (buffer) => this.bufferManager.releaseBuffer(buffer) ); - // Perform the scan this.computePrefixSumGPU(inputBuffer, outputBuffer, blockSumsBuffer, uniformBuffer, dataSize); - // Wait for GPU to finish await this.device.queue.onSubmittedWorkDone(); - // Read results const result = await this.bufferManager.readBuffer(outputBuffer, dataSize * 4); return result; @@ -176,9 +187,11 @@ export class ScanModule { } /** - * Internal method to compute prefix sum on GPU + * Internal method to compute prefix sum on GPU. * * This is exposed for use by RadixSorter which already has buffers allocated. + * Commands are queued synchronously; the caller is responsible for waiting + * on `device.queue.onSubmittedWorkDone()` before reading results. */ computePrefixSumGPU( inputBuffer: GPUBuffer, @@ -187,35 +200,28 @@ export class ScanModule { uniformBuffer: GPUBuffer, dataSize: number ): void { - const scanBindGroupLayout = this.scanBindGroupLayout; + const scanLayout = this.scanLayout; const blellochPipeline = this.blellochScanPipeline; - const scanBlockSumsPipeline = this.scanBlockSumsPipeline; const addBlockPrefixesPipeline = this.addBlockPrefixesPipeline; + const addPrefixesLayout = this.addPrefixesLayout; - if ( - !scanBindGroupLayout || - !blellochPipeline || - !scanBlockSumsPipeline || - !addBlockPrefixesPipeline - ) { + if (!scanLayout || !blellochPipeline || !addBlockPrefixesPipeline || !addPrefixesLayout) { throw new ShaderCompilationError('Scan pipelines not initialized'); } - // Calculate number of scan blocks const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK); - // Update scan uniforms - const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]); - this.device.queue.writeBuffer(uniformBuffer, 0, scanUniformData); - - // Use a single command encoder for all dispatches to ensure proper ordering - const commandEncoder = this.device.createCommandEncoder(); + // Uniform: [data_size, num_blocks, 0, 0] + // - blelloch_scan uses both data_size and num_blocks + // - scan_block_sums uses num_blocks (as element count) + // - add_block_prefixes uses data_size (for bounds check) + this.writeUniform(uniformBuffer, dataSize, numScanBlocks); // Step 1: Local Blelloch scan within each workgroup { const bindGroup = this.device.createBindGroup({ label: 'blelloch-scan-bind-group', - layout: scanBindGroupLayout, + layout: scanLayout, entries: [ { binding: 0, resource: { buffer: inputBuffer } }, { binding: 1, resource: { buffer: outputBuffer } }, @@ -224,55 +230,186 @@ export class ScanModule { ], }); + const commandEncoder = this.device.createCommandEncoder(); const passEncoder = commandEncoder.beginComputePass(); passEncoder.setPipeline(blellochPipeline); passEncoder.setBindGroup(0, bindGroup); passEncoder.dispatchWorkgroups(numScanBlocks); passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); } - // Step 2: Scan the block sums (if more than one block) + // Step 2 + 3: Scan block sums and add prefixes (if more than one block) if (numScanBlocks > 1) { - const bindGroup = this.device.createBindGroup({ - label: 'scan-block-sums-bind-group', - layout: scanBindGroupLayout, - entries: [ - { binding: 0, resource: { buffer: blockSumsBuffer } }, - { binding: 1, resource: { buffer: blockSumsBuffer } }, - { binding: 2, resource: { buffer: blockSumsBuffer } }, - { binding: 3, resource: { buffer: uniformBuffer } }, - ], - }); + this.scanBlockSumsRecursive(blockSumsBuffer, numScanBlocks, uniformBuffer); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(scanBlockSumsPipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(1); - passEncoder.end(); + // Re-write uniform (recursion may have overwritten it) for add_block_prefixes + this.writeUniform(uniformBuffer, dataSize, numScanBlocks); - // Step 3: Add block prefixes to each block's local results { const bindGroup = this.device.createBindGroup({ label: 'add-block-prefixes-bind-group', - layout: scanBindGroupLayout, + layout: addPrefixesLayout, entries: [ - { binding: 0, resource: { buffer: inputBuffer } }, { binding: 1, resource: { buffer: outputBuffer } }, { binding: 2, resource: { buffer: blockSumsBuffer } }, { binding: 3, resource: { buffer: uniformBuffer } }, ], }); + const commandEncoder = this.device.createCommandEncoder(); const passEncoder = commandEncoder.beginComputePass(); passEncoder.setPipeline(addBlockPrefixesPipeline); passEncoder.setBindGroup(0, bindGroup); passEncoder.dispatchWorkgroups(numScanBlocks); passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); } } + } + + /** + * Recursively compute exclusive prefix sum on the block sums buffer in-place. + * + * - If count <= MAX_LEAF_BLOCK_SUMS (512): single-workgroup in-place scan. + * - Otherwise: split into sub-blocks, scan recursively, add prefixes, copy back. + * + * Temp buffers are allocated and destroyed within this method. Per the WebGPU + * spec, destroying a buffer after submitting commands that reference it is + * safe — the GPU retains the memory until queued work completes. + */ + private scanBlockSumsRecursive(buffer: GPUBuffer, count: number, uniformBuffer: GPUBuffer): void { + const scanBlockSumsPipeline = this.scanBlockSumsPipeline; + const scanLayout = this.scanLayout; + const addBlockPrefixesPipeline = this.addBlockPrefixesPipeline; + const blockSumsScanLayout = this.blockSumsScanLayout; + const addPrefixesLayout = this.addPrefixesLayout; + const blellochScanPipeline = this.blellochScanPipeline; - // Submit all commands together - this.device.queue.submit([commandEncoder.finish()]); + if ( + !scanBlockSumsPipeline || + !scanLayout || + !addBlockPrefixesPipeline || + !blockSumsScanLayout || + !addPrefixesLayout || + !blellochScanPipeline + ) { + throw new ShaderCompilationError('Scan pipelines not initialized'); + } + + // Leaf case: single workgroup can handle all block sums in-place + if (count <= MAX_LEAF_BLOCK_SUMS) { + // scan_block_sums uses num_blocks as the element count + this.writeUniform(uniformBuffer, count, count); + + const bindGroup = this.device.createBindGroup({ + label: 'scan-block-sums-bind-group', + layout: blockSumsScanLayout, + entries: [ + { binding: 2, resource: { buffer } }, + { binding: 3, resource: { buffer: uniformBuffer } }, + ], + }); + + const commandEncoder = this.device.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(scanBlockSumsPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(1); + passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); + return; + } + + // Recursive case: split into sub-blocks + const numSubBlocks = Math.ceil(count / ELEMENTS_PER_SCAN_BLOCK); + + const tempOutput = this.device.createBuffer({ + label: 'scan-recursive-temp-output', + size: BufferManager.alignSize(count * 4, 4), + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, + }); + + const tempBlockSums = this.device.createBuffer({ + label: 'scan-recursive-temp-block-sums', + size: BufferManager.alignSize(numSubBlocks * 4, 4), + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, + }); + + // Sub-step 1: local scan of block sums into tempOutput + this.writeUniform(uniformBuffer, count, numSubBlocks); + + { + const bindGroup = this.device.createBindGroup({ + label: 'recursive-blelloch-scan-bind-group', + layout: scanLayout, + entries: [ + { binding: 0, resource: { buffer } }, + { binding: 1, resource: { buffer: tempOutput } }, + { binding: 2, resource: { buffer: tempBlockSums } }, + { binding: 3, resource: { buffer: uniformBuffer } }, + ], + }); + + const commandEncoder = this.device.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(blellochScanPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(numSubBlocks); + passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); + } + + // Sub-step 2: recursively scan the sub-block sums + this.scanBlockSumsRecursive(tempBlockSums, numSubBlocks, uniformBuffer); + + // Sub-step 3: add sub-block prefixes to tempOutput + this.writeUniform(uniformBuffer, count, numSubBlocks); + + { + const bindGroup = this.device.createBindGroup({ + label: 'recursive-add-prefixes-bind-group', + layout: addPrefixesLayout, + entries: [ + { binding: 1, resource: { buffer: tempOutput } }, + { binding: 2, resource: { buffer: tempBlockSums } }, + { binding: 3, resource: { buffer: uniformBuffer } }, + ], + }); + + const commandEncoder = this.device.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(addBlockPrefixesPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(numSubBlocks); + passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); + } + + // Sub-step 4: copy scanned result back into the original buffer + { + const commandEncoder = this.device.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + tempOutput, + 0, + buffer, + 0, + BufferManager.alignSize(count * 4, 4) + ); + this.device.queue.submit([commandEncoder.finish()]); + } + + // Release temp buffers (safe after submit per WebGPU spec) + tempOutput.destroy(); + tempBlockSums.destroy(); + } + + /** + * Write scan uniform data [dataSize, numBlocks, 0, 0] + */ + private writeUniform(uniformBuffer: GPUBuffer, dataSize: number, numBlocks: number): void { + const data = new Uint32Array([dataSize, numBlocks, 0, 0]); + this.device.queue.writeBuffer(uniformBuffer, 0, data); } /** @@ -296,7 +433,9 @@ export class ScanModule { this.blellochScanPipeline = null; this.scanBlockSumsPipeline = null; this.addBlockPrefixesPipeline = null; - this.scanBindGroupLayout = null; + this.scanLayout = null; + this.blockSumsScanLayout = null; + this.addPrefixesLayout = null; this.initialized = false; } }