diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..767d84610 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,4 @@ +# Agent Rules + +- Before pushing code or conflict-resolution commits to a PR branch, run `make lint` from the repository root. `cargo fmt --check` is not a substitute because CI's `Lint` job runs the full `make lint` target. +- If `make lint` cannot complete, do not push unless the user explicitly accepts that risk, and report the incomplete lint result. diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu index c22bc4d05..32dd75d47 100644 --- a/crypto/math-cuda/kernels/keccak.cu +++ b/crypto/math-cuda/kernels/keccak.cu @@ -159,8 +159,8 @@ extern "C" __global__ void keccak256_leaves_base_batched( uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; if (tid >= num_rows) return; - // Bit-reverse the row index so we read columns at `br` but write the - // hashed leaf at `tid` — matching the CPU `commit_columns_bit_reversed`. + // Bit-reverse the row index so we read columns at `br` but write the hashed + // leaf at `tid` — matching the CPU per-row `commit_bit_reversed(.., 1)`. uint64_t br = __brevll(tid) >> (64 - log_num_rows); uint64_t st[25]; @@ -181,6 +181,51 @@ extern "C" __global__ void keccak256_leaves_base_batched( finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); } +// --------------------------------------------------------------------------- +// Goldilocks BASE-FIELD row-pair leaf hashing. +// +// Leaf `leaf_idx` hashes TWO consecutive bit-reversed rows +// br_0 = reverse_index(2*leaf_idx), br_1 = reverse_index(2*leaf_idx + 1) +// each written column-by-column in canonical BE (same per-row byte layout as +// `keccak256_leaves_base_batched`), in (br_0 row: col 0..K-1) then (br_1 row: +// col 0..K-1) order. `num_leaves = num_rows / 2`; writes 32 bytes to +// `hashed_leaves_out[leaf_idx * 32 ..]`. Matches the CPU +// `keccak_leaves_row_pair_bit_reversed` (rows_per_leaf = 2) — the base-field +// analog of `keccak_comp_poly_leaves_ext3`. +// --------------------------------------------------------------------------- +extern "C" __global__ void keccak256_leaves_base_row_pair_batched( + const uint64_t *columns_base_ptr, + uint64_t col_stride, + uint64_t num_cols, + uint64_t num_rows, + uint64_t log_num_rows, + uint8_t *hashed_leaves_out) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint64_t num_leaves = num_rows >> 1; + if (tid >= num_leaves) return; + + uint64_t br_0 = __brevll(2 * tid) >> (64 - log_num_rows); + uint64_t br_1 = __brevll(2 * tid + 1) >> (64 - log_num_rows); + + uint64_t st[25]; + #pragma unroll + for (int i = 0; i < 25; ++i) st[i] = 0; + + uint32_t rate_pos = 0; + // First row (br_0): col 0..K-1. + for (uint64_t c = 0; c < num_cols; ++c) { + uint64_t v = columns_base_ptr[c * col_stride + br_0]; + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v))); + } + // Second row (br_1): col 0..K-1. + for (uint64_t c = 0; c < num_cols; ++c) { + uint64_t v = columns_base_ptr[c * col_stride + br_1]; + absorb_lane(st, rate_pos, bswap64(goldilocks::canonical(v))); + } + + finalize_keccak256(st, rate_pos, hashed_leaves_out + tid * 32); +} + // --------------------------------------------------------------------------- // Goldilocks EXT3 leaf hashing (3 base-field components per ext3 element). // diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 17e2f9f82..d2a98652b 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -143,6 +143,7 @@ pub struct Backend { // keccak.ptx pub keccak256_leaves_base_batched: CudaFunction, + pub keccak256_leaves_base_row_pair_batched: CudaFunction, pub keccak256_leaves_ext3_batched: CudaFunction, pub keccak_comp_poly_leaves_ext3: CudaFunction, pub keccak_fri_leaves_ext3: CudaFunction, @@ -238,6 +239,8 @@ impl Backend { pointwise_mul_batched: ntt.load_function("pointwise_mul_batched")?, scalar_mul_batched: ntt.load_function("scalar_mul_batched")?, keccak256_leaves_base_batched: keccak.load_function("keccak256_leaves_base_batched")?, + keccak256_leaves_base_row_pair_batched: keccak + .load_function("keccak256_leaves_base_row_pair_batched")?, keccak256_leaves_ext3_batched: keccak.load_function("keccak256_leaves_ext3_batched")?, keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?, keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?, diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index ee5dc3fce..c3f59fd90 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -16,7 +16,10 @@ use cudarc::driver::{CudaSlice, CudaStream, LaunchConfig, PushKernelArg}; use crate::Result; use crate::device::{Backend, backend}; -use crate::merkle::{keccak_launch_cfg, launch_keccak_base, launch_keccak_ext3}; +use crate::merkle::{ + keccak_launch_cfg, launch_keccak_base, launch_keccak_base_row_pair, launch_keccak_ext3, + launch_keccak_ext3_row_pair, +}; use crate::ntt::run_ntt_body; /// Goldilocks `TWO_ADICITY = 32` puts the theoretical domain ceiling at @@ -34,26 +37,26 @@ fn assert_u32_domain(n: usize, what: &str) { /// Output shape requested from the fused LDE + Keccak entry points. #[derive(Copy, Clone, PartialEq, Eq)] enum KeccakCommit { - /// Only the `lde_size` keccak-256 leaves; no inner-tree build. Caller - /// receives `lde_size * 32` bytes. + /// Only the keccak-256 leaves; no inner-tree build. Caller receives + /// `num_leaves * 32` bytes. LeavesOnly, /// Full Merkle tree: leaves at the tail + inner nodes built on-device. - /// Caller receives `(2*lde_size - 1) * 32` bytes. + /// Caller receives `(2*num_leaves - 1) * 32` bytes. FullTree, } impl KeccakCommit { - fn total_nodes_bytes(self, lde_size: usize) -> usize { + fn total_nodes_bytes(self, num_leaves: usize) -> usize { match self { - KeccakCommit::LeavesOnly => lde_size * 32, - KeccakCommit::FullTree => (2 * lde_size - 1) * 32, + KeccakCommit::LeavesOnly => num_leaves * 32, + KeccakCommit::FullTree => (2 * num_leaves - 1) * 32, } } - fn leaves_offset_bytes(self, lde_size: usize) -> usize { + fn leaves_offset_bytes(self, num_leaves: usize) -> usize { match self { KeccakCommit::LeavesOnly => 0, - KeccakCommit::FullTree => (lde_size - 1) * 32, + KeccakCommit::FullTree => (num_leaves - 1) * 32, } } } @@ -592,12 +595,12 @@ pub fn coset_lde_batch_base_into( Ok(()) } -/// Fused LDE + Keccak-256 leaf hashing. Caller receives the `lde_size * 32` -/// bytes of leaf hashes in `hashed_leaves_out` (one 32-byte digest per output -/// row, in natural row order; leaves are computed reading columns at -/// bit-reversed rows, matching `commit_columns_bit_reversed` on the CPU -/// side). Thin wrapper over `coset_lde_batch_base_into_with_merkle_tree_inner` -/// with `LeavesOnly` — no inner-tree build, no device handle. +/// Fused LDE + row-pair Keccak-256 leaf hashing. Caller receives +/// `(lde_size / 2) * 32` bytes of leaf hashes in `hashed_leaves_out` (one +/// 32-byte digest per bit-reversed row pair, in natural leaf order, matching +/// `commit_bit_reversed(.., 2)` on the CPU side). Thin wrapper over +/// `coset_lde_batch_base_into_with_merkle_tree_inner` with `LeavesOnly` — no +/// inner-tree build, no device handle. pub fn coset_lde_batch_base_into_with_leaf_hash( columns: &[&[u64]], blowup_factor: usize, @@ -613,13 +616,15 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + 2, ) .map(|_| ()) } /// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full -/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back -/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`). +/// row-pair Merkle tree on device and returns the `2*(lde_size/2) - 1` node +/// buffer back to the caller in `merkle_nodes_out` (byte length +/// `(2*(lde_size/2) - 1) * 32`). /// /// The leaf hashes are never exposed to the caller — they stay on device and /// feed straight into the pair-hash tree kernel, avoiding the @@ -640,6 +645,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + 2, ) .map(|_| ()) } @@ -662,11 +668,15 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep( merkle_nodes_out, KeccakCommit::FullTree, true, + // Trace commit: one leaf per bit-reversed row pair, matching the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and `verify_opening_pair`. + 2, )?; let handle = opt.expect("keep_device_buf=true must return Some"); Ok(handle) } +#[allow(clippy::too_many_arguments)] fn coset_lde_batch_base_into_with_merkle_tree_inner( columns: &[&[u64]], blowup_factor: usize, @@ -675,6 +685,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1), + // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit. + rows_per_leaf: usize, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -698,7 +711,13 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( for o in outputs.iter() { assert_eq!(o.len(), lde_size); } - let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); + assert!( + rows_per_leaf == 1 || rows_per_leaf == 2, + "rows_per_leaf must be 1 or 2" + ); + assert_eq!(lde_size % rows_per_leaf, 0); + let num_leaves = lde_size / rows_per_leaf; + let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves); assert_eq!(nodes_out.len(), nodes_dev_bytes); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -781,28 +800,39 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( )?; // Allocate the device output buffer. In `LeavesOnly` mode this is just - // `lde_size * 32` bytes (the leaves themselves); in `FullTree` mode it's - // `(2*lde_size - 1) * 32` bytes (leaves in the tail + inner nodes filled + // `num_leaves * 32` bytes (the leaves themselves); in `FullTree` mode it's + // `(2*num_leaves - 1) * 32` bytes (leaves in the tail + inner nodes filled // below). `alloc` (not `alloc_zeros`) is safe because every byte is // written before any reader sees it: the keccak kernel fills the // leaves slab, the inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; - let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size); + let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); { let mut leaves_view = - nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32); - launch_keccak_base( - stream.as_ref(), - &buf, - col_stride_u64, - m as u64, - lde_u64, - &mut leaves_view, - )?; + nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32); + if rows_per_leaf == 2 { + launch_keccak_base_row_pair( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } else { + launch_keccak_base( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } } if commit == KeccakCommit::FullTree { - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; } // D2H the LDE and the tree/leaves nodes via pinned staging. @@ -828,8 +858,8 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( } } -/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused -/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over +/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused LDE + +/// row-pair Keccak-256 leaf hashing over ext3 columns. Thin wrapper over /// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`. pub fn coset_lde_batch_ext3_into_with_leaf_hash( columns: &[&[u64]], @@ -848,13 +878,14 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + 2, ) .map(|_| ()) } /// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`. /// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE -/// evaluations and the full `2*lde_size - 1` node buffer. +/// evaluations and the full `2*(lde_size/2) - 1` row-pair node buffer. pub fn coset_lde_batch_ext3_into_with_merkle_tree( columns: &[&[u64]], n: usize, @@ -872,6 +903,7 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + 2, ) .map(|_| ()) } @@ -896,6 +928,9 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep( merkle_nodes_out, KeccakCommit::FullTree, true, + // Trace commit: one leaf per bit-reversed row pair, matching the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF=2)` and `verify_opening_pair`. + 2, )?; Ok(opt.expect("keep_device_buf=true must return Some")) } @@ -910,6 +945,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + // 1 = one leaf per bit-reversed row; 2 = one leaf per row pair (2i, 2i+1), + // matching the CPU `commit_bit_reversed(.., 2)` used for the trace commit. + rows_per_leaf: usize, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -935,7 +973,13 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( for o in outputs.iter() { assert_eq!(o.len(), 3 * lde_size); } - let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); + assert!( + rows_per_leaf == 1 || rows_per_leaf == 2, + "rows_per_leaf must be 1 or 2" + ); + assert_eq!(lde_size % rows_per_leaf, 0); + let num_leaves = lde_size / rows_per_leaf; + let nodes_dev_bytes = commit.total_nodes_bytes(num_leaves); assert_eq!(nodes_out.len(), nodes_dev_bytes); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -1012,26 +1056,37 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( mb_u32, )?; - // Allocate device output buffer (LeavesOnly → lde_size*32; FullTree → - // (2*lde_size - 1)*32). Leaf kernel writes to the leaves slab; the + // Allocate device output buffer (LeavesOnly -> num_leaves*32; FullTree -> + // (2*num_leaves - 1)*32). Leaf kernel writes to the leaves slab; the // inner-tree pass (when present) fills the head. let mut nodes_dev = unsafe { stream.alloc::(nodes_dev_bytes) }?; - let leaves_offset_bytes = commit.leaves_offset_bytes(lde_size); + let leaves_offset_bytes = commit.leaves_offset_bytes(num_leaves); { let mut leaves_view = - nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + lde_size * 32); - launch_keccak_ext3( - stream.as_ref(), - &buf, - col_stride_u64, - m as u64, - lde_u64, - &mut leaves_view, - )?; + nodes_dev.slice_mut(leaves_offset_bytes..leaves_offset_bytes + num_leaves * 32); + if rows_per_leaf == 2 { + launch_keccak_ext3_row_pair( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } else { + launch_keccak_ext3( + stream.as_ref(), + &buf, + col_stride_u64, + m as u64, + lde_u64, + &mut leaves_view, + )?; + } } if commit == KeccakCommit::FullTree { - crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; + crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; } // D2H LDE (mb * lde_size u64) and tree/leaves nodes. diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 932e81325..27f38ce0a 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -3,7 +3,7 @@ //! Matches `FieldElementVectorBackend::hash_data` in //! `crypto/crypto/src/merkle_tree/backends/field_element_vector.rs`, combined //! with the `reverse_index` row read pattern used in -//! `commit_columns_bit_reversed` at `crypto/stark/src/prover.rs`. +//! `commit_bit_reversed` at `crypto/stark/src/commitment.rs`. //! //! Caller supplies base-field column slabs already laid out as //! `[col * col_stride + row]` (the same layout `coset_lde_batch_base_into` @@ -25,15 +25,27 @@ use crate::lde::pack_ext3_to_pinned_slabs; /// Run GPU Keccak-256 leaf hashing on a base-field column buffer. /// /// `columns` must hold `num_cols * col_stride` u64s with column `c`'s data -/// at `[c*col_stride .. c*col_stride + num_rows]`. Returns `num_rows * 32` -/// hash bytes in natural (non-bit-reversed) row order. +/// at `[c*col_stride .. c*col_stride + num_rows]`. `rows_per_leaf` selects the +/// leaf layout: `1` = one leaf per bit-reversed row (`num_rows` leaves), `2` = +/// one leaf per bit-reversed row pair `2i`,`2i+1` (`num_rows/2` leaves, the +/// trace-commit layout). Returns `(num_rows / rows_per_leaf) * 32` hash bytes. pub fn keccak_leaves_base( columns: &[u64], col_stride: usize, num_cols: usize, num_rows: usize, + rows_per_leaf: usize, ) -> Result> { assert!(num_rows.is_power_of_two()); + assert!(rows_per_leaf == 1 || rows_per_leaf == 2); + assert!( + num_rows >= rows_per_leaf, + "num_rows must be at least rows_per_leaf" + ); + assert!( + num_rows >= 2, + "num_rows must be at least 2 for bit-reversed GPU leaf hashing" + ); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -45,8 +57,13 @@ pub fn keccak_leaves_base( let be = backend()?; let stream = be.next_stream(); let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::(num_rows * 32)?; - launch_keccak_base( + let mut out_dev = stream.alloc_zeros::((num_rows / rows_per_leaf) * 32)?; + let launch = if rows_per_leaf == 2 { + launch_keccak_base_row_pair + } else { + launch_keccak_base + }; + launch( stream.as_ref(), &cols_dev, col_stride as u64, @@ -60,14 +77,25 @@ pub fn keccak_leaves_base( } /// Ext3 variant. Columns interleaved as three base slabs per ext3 column. -/// `columns.len() >= num_cols * 3 * col_stride`. +/// `columns.len() >= num_cols * 3 * col_stride`. `rows_per_leaf` as in +/// [`keccak_leaves_base`]. pub fn keccak_leaves_ext3( columns: &[u64], col_stride: usize, num_cols: usize, num_rows: usize, + rows_per_leaf: usize, ) -> Result> { assert!(num_rows.is_power_of_two()); + assert!(rows_per_leaf == 1 || rows_per_leaf == 2); + assert!( + num_rows >= rows_per_leaf, + "num_rows must be at least rows_per_leaf" + ); + assert!( + num_rows >= 2, + "num_rows must be at least 2 for bit-reversed GPU leaf hashing" + ); assert!( col_stride >= num_rows, "col_stride must be >= num_rows to keep per-column reads in-bounds" @@ -80,8 +108,13 @@ pub fn keccak_leaves_ext3( let be = backend()?; let stream = be.next_stream(); let cols_dev = stream.clone_htod(&columns[..total])?; - let mut out_dev = stream.alloc_zeros::(num_rows * 32)?; - launch_keccak_ext3( + let mut out_dev = stream.alloc_zeros::((num_rows / rows_per_leaf) * 32)?; + let launch = if rows_per_leaf == 2 { + launch_keccak_ext3_row_pair + } else { + launch_keccak_ext3 + }; + launch( stream.as_ref(), &cols_dev, col_stride as u64, @@ -170,6 +203,72 @@ pub(crate) fn launch_keccak_base( Ok(()) } +/// Row-pair base-field leaf hashing: leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1` (one Merkle path per FRI query). Writes `num_rows/2` leaves of 32 +/// bytes into `out_dev`. Base-field analog of the comp-poly ext3 path; matches +/// the CPU `keccak_leaves_row_pair_bit_reversed`. +pub(crate) fn launch_keccak_base_row_pair( + stream: &CudaStream, + cols_dev: &CudaSlice, + col_stride: u64, + num_cols: u64, + num_rows: u64, + out_dev: &mut CudaViewMut<'_, u8>, +) -> Result<()> { + debug_assert!( + num_rows >= 2, + "keccak row-pair leaf kernel: num_rows must be >= 2" + ); + let be = backend()?; + let log_num_rows = num_rows.trailing_zeros() as u64; + // One thread per leaf (= row pair). + let cfg = keccak_launch_cfg(num_rows >> 1); + unsafe { + stream + .launch_builder(&be.keccak256_leaves_base_row_pair_batched) + .arg(cols_dev) + .arg(&col_stride) + .arg(&num_cols) + .arg(&num_rows) + .arg(&log_num_rows) + .arg(out_dev) + .launch(cfg)?; + } + Ok(()) +} + +/// Row-pair ext3 leaf hashing for the aux trace: reuses the comp-poly kernel +/// (`keccak_comp_poly_leaves_ext3`), which hashes bit-reversed rows `2i`, `2i+1` +/// across all ext3 columns. Writes `num_rows/2` leaves of 32 bytes. +pub(crate) fn launch_keccak_ext3_row_pair( + stream: &CudaStream, + cols_dev: &CudaSlice, + col_stride: u64, + num_cols: u64, + num_rows: u64, + out_dev: &mut CudaViewMut<'_, u8>, +) -> Result<()> { + debug_assert!( + num_rows >= 2, + "keccak row-pair leaf kernel: num_rows must be >= 2" + ); + let be = backend()?; + let log_num_rows = num_rows.trailing_zeros() as u64; + let cfg = keccak_launch_cfg(num_rows >> 1); + unsafe { + stream + .launch_builder(&be.keccak_comp_poly_leaves_ext3) + .arg(cols_dev) + .arg(&col_stride) + .arg(&num_cols) + .arg(&num_rows) + .arg(&log_num_rows) + .arg(out_dev) + .launch(cfg)?; + } + Ok(()) +} + /// Given `hashed_leaves` of length `leaves_len * 32`, build the full Merkle /// tree on device and return the complete node buffer `(2*leaves_len - 1) * /// 32` bytes in the standard layout: diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs index 29e33b6fe..51b826dd1 100644 --- a/crypto/math-cuda/tests/comp_poly_tree.rs +++ b/crypto/math-cuda/tests/comp_poly_tree.rs @@ -1,6 +1,6 @@ //! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree` //! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU -//! pipeline produced by `commit_composition_polynomial`. +//! row-pair commitment layout used by `commit_bit_reversed(.., 2)`. use math::field::element::FieldElement; use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; @@ -83,7 +83,7 @@ fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { out } -/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves. +/// CPU: `commit_bit_reversed(.., 2)`-style tree root over num_rows/2 leaves. fn cpu_tree_nodes(parts: &[Vec]) -> Vec<[u8; 32]> { let num_rows = parts[0].len(); let num_parts = parts.len(); diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs index d614e233d..61a861f32 100644 --- a/crypto/math-cuda/tests/keccak_leaves.rs +++ b/crypto/math-cuda/tests/keccak_leaves.rs @@ -38,7 +38,7 @@ fn keccak_leaves_base_matches_cpu() { flat[c * n + r] = *e.value(); } } - let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 1).unwrap(); assert_eq!(gpu.len(), n * 32); for i in 0..n { assert_eq!( @@ -84,7 +84,7 @@ fn keccak_leaves_ext3_matches_cpu() { flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); } } - let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n).unwrap(); + let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 1).unwrap(); assert_eq!(gpu.len(), n * 32); for i in 0..n { assert_eq!( @@ -97,6 +97,87 @@ fn keccak_leaves_ext3_matches_cpu() { } } +#[test] +fn keccak_leaves_base_row_pair_matches_cpu() { + // Row-pair (trace) commit: leaf `i` hashes bit-reversed rows `2i`, `2i+1`. + // GPU `keccak_leaves_base(.., rows_per_leaf=2)` must match the CPU prover + // helper `keccak_leaves_row_pair_bit_reversed` over base columns. + for log_n in [4u32, 6, 8, 10, 12] { + for num_cols in [1usize, 5, 17, 41] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(500 + log_n as u64 + num_cols as u64); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| Fp::from_raw(rng.r#gen::())).collect()) + .collect(); + + let cpu = keccak_leaves_row_pair_bit_reversed(&columns); + assert_eq!(cpu.len(), n / 2); + + let mut flat = vec![0u64; num_cols * n]; + for (c, col) in columns.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + flat[c * n + r] = *e.value(); + } + } + let gpu = math_cuda::merkle::keccak_leaves_base(&flat, n, num_cols, n, 2).unwrap(); + assert_eq!(gpu.len(), (n / 2) * 32); + for i in 0..n / 2 { + assert_eq!( + &gpu[i * 32..(i + 1) * 32], + &cpu[i][..], + "base row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})" + ); + } + } + } +} + +#[test] +fn keccak_leaves_ext3_row_pair_matches_cpu() { + for log_n in [4u32, 6, 8, 10] { + for num_cols in [1usize, 3, 11, 20] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(600 + log_n as u64 + num_cols as u64); + let columns: Vec> = (0..num_cols) + .map(|_| { + (0..n) + .map(|_| { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) + }) + .collect() + }) + .collect(); + + let cpu = keccak_leaves_row_pair_bit_reversed(&columns); + assert_eq!(cpu.len(), n / 2); + + // De-interleaved 3-slab layout per ext3 column (same as the 1-row + // ext3 leaf path): [col*3+k] each a contiguous slab of n u64s. + let mut flat = vec![0u64; num_cols * 3 * n]; + for (c, col) in columns.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + flat[(c * 3) * n + r] = *e.value()[0].value(); + flat[(c * 3 + 1) * n + r] = *e.value()[1].value(); + flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); + } + } + let gpu = math_cuda::merkle::keccak_leaves_ext3(&flat, n, num_cols, n, 2).unwrap(); + assert_eq!(gpu.len(), (n / 2) * 32); + for i in 0..n / 2 { + assert_eq!( + &gpu[i * 32..(i + 1) * 32], + &cpu[i][..], + "ext3 row-pair leaf mismatch at i={i} (log_n={log_n}, cols={num_cols})" + ); + } + } + } +} + #[test] fn keccak_comp_poly_leaves_matches_cpu() { // Built tree's leaves live at byte offset `(num_leaves - 1) * 32` and diff --git a/crypto/stark/src/commitment.rs b/crypto/stark/src/commitment.rs new file mode 100644 index 000000000..d4a6dbdbe --- /dev/null +++ b/crypto/stark/src/commitment.rs @@ -0,0 +1,155 @@ +//! Merkle-tree commitment to bit-reversed, column-major LDE evaluations. +//! +//! This is the commitment layer the prover uses for the main/aux trace LDEs and +//! the composition-polynomial parts. It is decoupled from `IsStarkProver`: the +//! prover only orchestrates *when* to commit; the *how* (leaf layout, bit-reverse +//! permutation, Keccak hashing, tree build) lives here. +//! +//! ## Leaf layout +//! +//! For each leaf `i` we hash `rows_per_leaf` consecutive (bit-reversed) rows, +//! big-endian-concatenated column-by-column: +//! +//! ```text +//! leaf(i) = keccak( col_0[br(R·i)]‖col_1[br(R·i)]‖… ‖ col_0[br(R·i+1)]‖… ‖ … ) +//! where R = rows_per_leaf and br(j) = reverse_index(j, num_rows) +//! ``` +//! +//! - `rows_per_leaf == 2` (`ROWS_PER_LEAF`): a row pair per leaf (leaf `i` hashes +//! rows `2i` and `2i+1`). Used by BOTH the main/aux trace LDE and the +//! composition-polynomial parts: a FRI query opens a value and its symmetric +//! counterpart — exactly this pair — so one Merkle path authenticates both. +//! - `rows_per_leaf == 1`: one row per leaf. No longer used by the prover; kept +//! only so the GPU parity tests can compare against the per-row code path. +//! +//! The field-element serialization (`write_bytes_be`) + `hash_bytes` path is kept +//! exactly as before. + +use math::fft::bit_reversing::reverse_index; +use math::field::element::FieldElement; +use math::field::traits::IsField; +use math::traits::{AsBytes, ByteConversion}; + +#[cfg(feature = "parallel")] +use rayon::prelude::{IntoParallelIterator, ParallelIterator}; + +use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; + +/// Number of consecutive (bit-reversed) rows packed into one Merkle leaf for the +/// trace AND composition-polynomial commitments: the row-pair leaf the FRI +/// openings rely on (leaf `i` hashes rows `2i` and `2i+1`, so one Merkle path +/// authenticates both a value and its symmetric counterpart). +pub const ROWS_PER_LEAF: usize = 2; + +/// Computes the Keccak-256 leaf hashes for a bit-reversed, column-major commitment, +/// grouping `rows_per_leaf` consecutive bit-reversed rows into each leaf. +/// +/// Returns one `Commitment` per leaf (`columns[0].len() / rows_per_leaf` leaves), +/// or an empty `Vec` when there is nothing to hash. See the module docs for the +/// exact leaf byte layout. This is the single code path behind both the per-row +/// ([`keccak_leaves_bit_reversed`]) and per-row-pair +/// ([`keccak_leaves_row_pair_bit_reversed`]) commitments. +pub fn keccak_leaves_bit_reversed_grouped( + columns: &[Vec>], + rows_per_leaf: usize, +) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + if columns.is_empty() || columns[0].is_empty() { + return Vec::new(); + } + + let num_rows = columns[0].len(); + let byte_len = as ByteConversion>::BYTE_LEN; + + debug_assert!( + num_rows.is_power_of_two(), + "num_rows must be a power of two for reverse_index" + ); + debug_assert!( + rows_per_leaf >= 1 && num_rows.is_multiple_of(rows_per_leaf), + "num_rows must be a multiple of rows_per_leaf" + ); + + let num_leaves = num_rows / rows_per_leaf; + let total_bytes = rows_per_leaf * columns.len() * byte_len; + + // Leaf `i`: the `rows_per_leaf` bit-reversed rows starting at `R·i`, each row + // written column-by-column in big-endian, then hashed once. + let hash_leaf = |buf: &mut [u8], leaf_idx: usize| -> Commitment { + let mut offset = 0; + for k in 0..rows_per_leaf { + let br = reverse_index(rows_per_leaf * leaf_idx + k, num_rows as u64); + for col in columns { + col[br].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + } + BatchedMerkleTreeBackend::::hash_bytes(buf) + }; + + // Per-thread buffer reuse (map_init) avoids millions of small allocations. + #[cfg(feature = "parallel")] + let result: Vec = (0..num_leaves) + .into_par_iter() + .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i)) + .collect(); + + #[cfg(not(feature = "parallel"))] + let result: Vec = { + let mut buf = vec![0u8; total_bytes]; + (0..num_leaves).map(|i| hash_leaf(&mut buf, i)).collect() + }; + + result +} + +/// Per-row Keccak-256 leaf hashes (one leaf per bit-reversed row). Thin wrapper +/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 1`. +/// +/// The prover no longer commits per-row (trace and composition both use the +/// row-pair layout, `ROWS_PER_LEAF`); this stays a named public function only so +/// the GPU parity tests in dependent crates can compare the per-row code path. +pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + keccak_leaves_bit_reversed_grouped(columns, 1) +} + +/// Per-row-pair Keccak-256 leaf hashes (leaf `i` hashes bit-reversed rows `2i`, +/// `2i+1`). Used for the composition-polynomial-parts commitment. Thin wrapper +/// over [`keccak_leaves_bit_reversed_grouped`] with `rows_per_leaf = 2`. +pub fn keccak_leaves_row_pair_bit_reversed(parts: &[Vec>]) -> Vec +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + keccak_leaves_bit_reversed_grouped(parts, 2) +} + +/// Builds the Merkle tree committing to `columns`' bit-reversed, column-major LDE +/// evaluations, grouping `rows_per_leaf` rows per leaf, and returns the tree and +/// its root. `None` when there is nothing to commit. +/// +/// Replaces the prover's former `commit_columns_bit_reversed` (`rows_per_leaf = 1`) +/// and `commit_composition_polynomial` (`rows_per_leaf = 2`). +pub fn commit_bit_reversed( + columns: &[Vec>], + rows_per_leaf: usize, +) -> Option<(BatchedMerkleTree, Commitment)> +where + E: IsField, + FieldElement: AsBytes + Sync + Send + ByteConversion, +{ + if columns.is_empty() || columns[0].is_empty() { + return None; + } + let hashed_leaves = keccak_leaves_bit_reversed_grouped(columns, rows_per_leaf); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; + let root = tree.root; + Some((tree, root)) +} diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 36756b40b..5f09cd3b2 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -269,17 +269,17 @@ fn restore_columns_on_err(columns: &mut [Vec>], n: u } } -/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves +/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `num_leaves` leaves /// and return the node `Vec` (length-initialised, contents undefined) together -/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if -/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows +/// with its node count `total_nodes` (`2 * num_leaves - 1`). Returns `None` if +/// the layout would be invalid (`num_leaves < 2` or `total_nodes * 32` overflows /// `usize`). The caller builds the `&mut [u8]` byte view of length /// `total_nodes * 32` and must overwrite every byte via the GPU D2H. -fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> { - if lde_size < 2 { +fn alloc_merkle_nodes(num_leaves: usize) -> Option<(Vec<[u8; 32]>, usize)> { + if num_leaves < 2 { return None; } - let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?; + let total_nodes = 2usize.saturating_mul(num_leaves).checked_sub(1)?; let _byte_len = total_nodes.checked_mul(32)?; let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes); // SAFETY: every byte will be overwritten via the GPU D2H before the @@ -471,7 +471,11 @@ where LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; let num_columns = columns.len(); - let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?; + // Row-pair trace commit: one Merkle leaf per bit-reversed row pair, so the + // tree has `lde_size / ROWS_PER_LEAF` leaves (matches the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF)` and `verify_opening_pair`). + let num_leaves = lde_size / crate::commitment::ROWS_PER_LEAF; + let (mut nodes, total_nodes) = alloc_merkle_nodes(num_leaves)?; let node_byte_len = total_nodes .checked_mul(32) .expect("node byte length overflow"); @@ -528,7 +532,11 @@ where LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; let num_columns = columns.len(); - let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?; + // Row-pair trace commit: one Merkle leaf per bit-reversed row pair, so the + // tree has `lde_size / ROWS_PER_LEAF` leaves (matches the CPU + // `commit_bit_reversed(.., ROWS_PER_LEAF)` and `verify_opening_pair`). + let num_leaves = lde_size / crate::commitment::ROWS_PER_LEAF; + let (mut nodes, total_nodes) = alloc_merkle_nodes(num_leaves)?; let node_byte_len = total_nodes .checked_mul(32) .expect("node byte length overflow"); @@ -733,8 +741,8 @@ where /// host-side ext3 LDE eval Vecs produced by /// [`try_evaluate_parts_on_lde_gpu_keep`] (or the CPU path). Uses the same /// row-pair leaf pattern as the CPU -/// `commit_composition_polynomial`: each leaf hashes 2 consecutive -/// bit-reversed rows. +/// `commit_bit_reversed` (composition-polynomial commit path): each leaf hashes +/// 2 consecutive bit-reversed rows. /// /// Returns `None` to fall through to the CPU path when the type or size /// conditions don't hold; returns `None` on a math-cuda `Err` so the caller diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs index 16ff95082..aa5cc5436 100644 --- a/crypto/stark/src/instruments.rs +++ b/crypto/stark/src/instruments.rs @@ -33,7 +33,7 @@ pub struct TableSubOps { pub constraints: Duration, /// decompose_and_extend_d2 pub comp_decompose: Duration, - /// commit_composition_polynomial + /// commit_bit_reversed (composition-polynomial commit step) pub comp_commit: Duration, /// Round 3: barycentric OOD evaluation pub ood: Duration, @@ -52,11 +52,11 @@ pub struct TableSubOps { pub struct Round1SubOps { /// Main trace: expand_columns_to_lde (LDE/FFT) pub main_lde: Duration, - /// Main trace: commit_columns_bit_reversed (Merkle) + /// Main trace: commit_bit_reversed (Merkle) pub main_merkle: Duration, /// Aux trace: expand_columns_to_lde (LDE/FFT) pub aux_lde: Duration, - /// Aux trace: commit_columns_bit_reversed (Merkle) + /// Aux trace: commit_bit_reversed (Merkle) pub aux_merkle: Duration, } diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index e9f6a1cda..87236c5f9 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -5,6 +5,7 @@ compile_error!("the `disk-spill` feature requires memmap2, which does not compil #[cfg(feature = "debug-checks")] pub mod bus_debug; +pub mod commitment; pub mod constraints; pub mod context; pub mod debug; diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs index a20a452b6..051651fde 100644 --- a/crypto/stark/src/par.rs +++ b/crypto/stark/src/par.rs @@ -37,3 +37,55 @@ where (a(), b()) } } + +/// Map `f(i)` over `range` and collect into a `Vec`, preserving index order. +/// Parallel when `feature = "parallel"`, sequential otherwise. Rayon's +/// `collect()` is index-ordered, so the result is identical either way. +pub(crate) fn par_map_collect( + range: std::ops::Range, + f: impl Fn(usize) -> R + Sync + Send, +) -> Vec { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + range.into_par_iter().map(f).collect() + } + #[cfg(not(feature = "parallel"))] + { + range.map(f).collect() + } +} + +/// Run `f(&mut item)` for each element of `slice`. Parallel when +/// `feature = "parallel"`, sequential otherwise (ordering is irrelevant). +pub(crate) fn par_for_each_mut(slice: &mut [T], f: impl Fn(&mut T) + Sync + Send) { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + slice.par_iter_mut().for_each(f); + } + #[cfg(not(feature = "parallel"))] + { + slice.iter_mut().for_each(f); + } +} + +/// Run `f(&mut item)` for each element of `slice`, short-circuiting on the +/// first `Err`. Parallel when `feature = "parallel"`, sequential otherwise. +// Only called from `disk-spill`-gated paths; keep it available without warning +// when that feature is off. +#[cfg_attr(not(feature = "disk-spill"), allow(dead_code))] +pub(crate) fn par_try_for_each_mut( + slice: &mut [T], + f: impl Fn(&mut T) -> Result<(), E> + Sync + Send, +) -> Result<(), E> { + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + slice.par_iter_mut().try_for_each(f) + } + #[cfg(not(feature = "parallel"))] + { + slice.iter_mut().try_for_each(f) + } +} diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs index 1751d60fe..851c0b37a 100644 --- a/crypto/stark/src/proof/stark.rs +++ b/crypto/stark/src/proof/stark.rs @@ -10,9 +10,14 @@ use crate::{ #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[serde(bound = "")] +/// Opening of a bit-reversed, row-paired commitment at one FRI query. +/// +/// The queried row and its symmetric counterpart (LDE positions `2·iota`, +/// `2·iota+1`) are committed together as a single leaf at position `iota`, so one +/// Merkle `proof` authenticates both `evaluations` (the row) and +/// `evaluations_sym` (its symmetric). Same layout used for trace and composition. pub struct PolynomialOpenings { pub proof: Proof, - pub proof_sym: Proof, pub evaluations: Vec>, pub evaluations_sym: Vec>, } diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 4da57559c..ba3f870a1 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -11,7 +11,7 @@ use math::fft::errors::FFTError; use log::info; use math::field::traits::{IsField, IsSubFieldOf}; use math::spill_safe::SpillSafe; -use math::traits::{AsBytes, ByteConversion}; +use math::traits::AsBytes; use math::{ field::{element::FieldElement, traits::IsFFTField}, polynomial::Polynomial, @@ -33,7 +33,10 @@ use crate::storage_mode::StorageMode; use crate::table::Table; use crate::trace::LDETraceTable; -use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; +use super::config::{BatchedMerkleTree, Commitment}; +// `BatchedMerkleTreeBackend` is only named by the GPU leaf+tree builders. +#[cfg(feature = "cuda")] +use super::config::BatchedMerkleTreeBackend; use super::constraints::evaluator::ConstraintEvaluator; use super::domain::{Domain, DomainConstants}; use super::fri::fri_decommit::FriDecommitment; @@ -43,6 +46,8 @@ use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof}; use super::trace::TraceTable; use super::traits::AIR; +pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed}; + /// A triple of (AIR, TraceTable, PublicInputs) for proving. type AirTracePair<'a, Field, FieldExtension, PI> = ( &'a dyn AIR, @@ -87,6 +92,12 @@ pub enum ProvingError { DiskSpill(String), } +impl From for ProvingError { + fn from(e: FFTError) -> Self { + ProvingError::WrongParameter(format!("{e}")) + } +} + /// Commitment artifacts for one trace table (main or auxiliary). Used for both /// plain and preprocessed tables. Preprocessed tables additionally carry a /// separate Merkle tree over their precomputed columns, hence the optional @@ -346,7 +357,7 @@ where /// A container for the results of the third round of the STARK Prove protocol. pub(crate) struct Round3 { - /// Evaluations of the trace polynomials, main ans auxiliary, at the out-of-domain challenge. + /// Evaluations of the trace polynomials, main and auxiliary, at the out-of-domain challenge. trace_ood_evaluations: Table, /// Evaluations of the composition polynomial parts at the out-of-domain challenge. composition_poly_parts_ood_evaluation: Vec>, @@ -389,128 +400,6 @@ where } } -/// Compute Keccak-256 leaf hashes for `commit_columns_bit_reversed`: one -/// leaf per row, where each row is read at `reverse_index(row_idx)` and the -/// columns are concatenated as big-endian bytes before hashing. -/// -/// Returns `Vec` with the same length as `columns[0]`. Exposed -/// (instead of being a closure inside `commit_columns_bit_reversed`) so -/// parity tests in dependent crates can compare against the same code path -/// the prover uses. -pub fn keccak_leaves_bit_reversed(columns: &[Vec>]) -> Vec -where - E: IsField, - FieldElement: AsBytes + Sync + Send + ByteConversion, -{ - if columns.is_empty() || columns[0].is_empty() { - return Vec::new(); - } - - let num_rows = columns[0].len(); - let num_cols = columns.len(); - let byte_len = as ByteConversion>::BYTE_LEN; - - debug_assert!( - num_rows.is_power_of_two(), - "num_rows must be a power of two for reverse_index" - ); - - let total_bytes = num_cols * byte_len; - - let hash_leaf = |buf: &mut [u8], row_idx: usize| -> Commitment { - let br_idx = reverse_index(row_idx, num_rows as u64); - for col_idx in 0..num_cols { - columns[col_idx][br_idx] - .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]); - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }; - - #[cfg(feature = "parallel")] - let iter = (0..num_rows).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..num_rows; - - // Per-thread buffer reuse: map_init allocates one buffer per Rayon thread, - // eliminating millions of small heap allocations under parallel contention. - #[cfg(feature = "parallel")] - let result: Vec = iter - .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf(buf, i)) - .collect(); - - #[cfg(not(feature = "parallel"))] - let result: Vec = { - let mut buf = vec![0u8; total_bytes]; - iter.map(|i| hash_leaf(&mut buf, i)).collect() - }; - - result -} - -/// Compute Keccak-256 leaf hashes for `commit_composition_polynomial`: one -/// leaf per row-pair, where leaf `i` hashes the BE concatenation of -/// `parts[..][br_0] ++ parts[..][br_1]` with -/// `br_k = reverse_index(2*i + k, num_rows)`. -/// -/// Returns `Vec` of length `parts[0].len() / 2`. -pub fn keccak_leaves_row_pair_bit_reversed(parts: &[Vec>]) -> Vec -where - E: IsField, - FieldElement: AsBytes + Sync + Send + ByteConversion, -{ - let num_parts = parts.len(); - if num_parts == 0 { - return Vec::new(); - } - let num_rows = parts[0].len(); - if num_rows == 0 { - return Vec::new(); - } - - let num_leaves = num_rows / 2; - debug_assert!( - num_rows.is_power_of_two(), - "num_rows must be a power of two for reverse_index" - ); - - let byte_len = as ByteConversion>::BYTE_LEN; - - let total_bytes = 2 * num_parts * byte_len; - - let hash_leaf_pair = |buf: &mut [u8], leaf_idx: usize| -> Commitment { - let br_0 = reverse_index(2 * leaf_idx, num_rows as u64); - let br_1 = reverse_index(2 * leaf_idx + 1, num_rows as u64); - let mut offset = 0; - for part in parts.iter() { - part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]); - offset += byte_len; - } - for part in parts.iter() { - part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]); - offset += byte_len; - } - BatchedMerkleTreeBackend::::hash_bytes(buf) - }; - - #[cfg(feature = "parallel")] - let iter = (0..num_leaves).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..num_leaves; - - #[cfg(feature = "parallel")] - let result: Vec = iter - .map_init(|| vec![0u8; total_bytes], |buf, i| hash_leaf_pair(buf, i)) - .collect(); - - #[cfg(not(feature = "parallel"))] - let result: Vec = { - let mut buf = vec![0u8; total_bytes]; - iter.map(|i| hash_leaf_pair(&mut buf, i)).collect() - }; - - result -} - /// The functionality of a STARK prover providing methods to run the STARK Prove protocol /// https://lambdaclass.github.io/lambdaworks/starks/protocol.html /// The default implementation is complete and is compatible with Stone prover @@ -529,29 +418,6 @@ pub trait IsStarkProver< FieldElement: math::traits::ByteConversion, FieldElement: math::traits::ByteConversion, { - /// Builds a Merkle tree commitment from column-major LDE evaluations with - /// bit-reverse permutation, without cloning the full evaluation matrix. - /// - /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...` - /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle - /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow, - /// but avoids allocating the cloned and transposed matrices entirely. - fn commit_columns_bit_reversed( - columns: &[Vec>], - ) -> Option<(BatchedMerkleTree, Commitment)> - where - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - E: IsField, - { - if columns.is_empty() || columns[0].is_empty() { - return None; - } - let hashed_leaves = keccak_leaves_bit_reversed(columns); - let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; - let root = tree.root; - Some((tree, root)) - } - /// Compute the LDE commitment for a subset of columns from a trace (for testing). /// /// This helper computes the same commitment the prover generates internally, @@ -574,7 +440,8 @@ pub trait IsStarkProver< let twiddles = LdeTwiddles::new(&domain); let evals = Self::compute_lde_from_columns_cached::(&precomputed, &domain, &twiddles); - let (_, commitment) = Self::commit_columns_bit_reversed(&evals)?; + let (_, commitment) = + crate::commitment::commit_bit_reversed(&evals, crate::commitment::ROWS_PER_LEAF)?; Some(commitment) } @@ -596,23 +463,16 @@ pub trait IsStarkProver< return Vec::new(); } - #[cfg(not(feature = "parallel"))] - let columns_iter = columns.iter(); - #[cfg(feature = "parallel")] - let columns_iter = columns.par_iter(); - - columns_iter - .map(|col| { - Polynomial::coset_lde_full::( - col, - domain.blowup_factor, - &twiddles.coset_weights, - &twiddles.inv, - &twiddles.fwd, - ) - }) - .collect::>>, _>>() + crate::par::par_map_collect(0..columns.len(), |i| { + Polynomial::coset_lde_full::( + &columns[i], + domain.blowup_factor, + &twiddles.coset_weights, + &twiddles.inv, + &twiddles.fwd, + ) .expect("coset LDE computation") + }) } /// Expand each column in-place from N evaluations to N×blowup LDE evaluations. @@ -647,11 +507,7 @@ pub trait IsStarkProver< return; } - #[cfg(feature = "parallel")] - let iter = columns.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let iter = columns.iter_mut(); - iter.for_each(|buf| { + crate::par::par_for_each_mut(columns, |buf| { Polynomial::coset_lde_full_expand::( buf, domain.blowup_factor, @@ -726,37 +582,38 @@ pub trait IsStarkProver< let t_sub = Instant::now(); let commit = match precomputed { - None => { - #[allow(unused_mut)] - let (mut tree, root) = Self::commit_columns_bit_reversed(&columns) - .ok_or(ProvingError::EmptyCommitment)?; + None => Self::commit_plain::( + &columns, #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - tree.spill_nodes_to_disk() - .map_err(|e| ProvingError::DiskSpill(format!("main Merkle tree: {e}")))?; - } - TableCommit::plain(tree, root) - } + storage_mode, + #[cfg(feature = "disk-spill")] + "main Merkle tree", + )?, Some((expected_precomputed_root, num_cols)) => { #[allow(unused_mut)] let (mut precomputed_tree, precomputed_root) = - Self::commit_columns_bit_reversed(&columns[..num_cols]) - .ok_or(ProvingError::EmptyCommitment)?; + crate::commitment::commit_bit_reversed( + &columns[..num_cols], + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?; #[allow(unused_mut)] - let (mut mult_tree, mult_root) = - Self::commit_columns_bit_reversed(&columns[num_cols..]) - .ok_or(ProvingError::EmptyCommitment)?; + let (mut mult_tree, mult_root) = crate::commitment::commit_bit_reversed( + &columns[num_cols..], + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?; if precomputed_root != expected_precomputed_root { return Err(ProvingError::PrecomputedCommitmentMismatch); } #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - precomputed_tree.spill_nodes_to_disk().map_err(|e| { - ProvingError::DiskSpill(format!("precomputed Merkle tree: {e}")) - })?; - mult_tree - .spill_nodes_to_disk() - .map_err(|e| ProvingError::DiskSpill(format!("mult Merkle tree: {e}")))?; + { + Self::spill_tree( + &mut precomputed_tree, + storage_mode, + "precomputed Merkle tree", + )?; + Self::spill_tree(&mut mult_tree, storage_mode, "mult Merkle tree")?; } TableCommit::preprocessed( mult_tree, @@ -777,6 +634,48 @@ pub trait IsStarkProver< Ok((commit, columns)) } + /// Spill a committed Merkle tree to disk when `storage_mode` is `Disk`, + /// tagging any I/O error with `label`. No-op otherwise. Shared by every commit + /// site (main / preprocessed split / aux). + #[cfg(feature = "disk-spill")] + fn spill_tree( + tree: &mut BatchedMerkleTree, + storage_mode: StorageMode, + label: &str, + ) -> Result<(), ProvingError> + where + C: IsField, + FieldElement: AsBytes + Sync + Send, + { + if storage_mode == StorageMode::Disk { + tree.spill_nodes_to_disk() + .map_err(|e| ProvingError::DiskSpill(format!("{label}: {e}")))?; + } + Ok(()) + } + + /// Commit an already-LDE-expanded plain column set: build the row-paired + /// Merkle tree, spill it to disk if requested, and wrap it as a plain + /// `TableCommit`. Shared by the main-trace (non-preprocessed) and aux-trace + /// commit paths. + fn commit_plain( + columns: &[Vec>], + #[cfg(feature = "disk-spill")] storage_mode: StorageMode, + #[cfg(feature = "disk-spill")] spill_label: &str, + ) -> Result, ProvingError> + where + C: IsField, + FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, + { + #[allow(unused_mut)] + let (mut tree, root) = + crate::commitment::commit_bit_reversed(columns, crate::commitment::ROWS_PER_LEAF) + .ok_or(ProvingError::EmptyCommitment)?; + #[cfg(feature = "disk-spill")] + Self::spill_tree(&mut tree, storage_mode, spill_label)?; + Ok(TableCommit::plain(tree, root)) + } + /// Recompute Round1 from the trace, reusing the Merkle trees stored in commitments. /// /// Only used by `run_debug_checks` — Phase D consumes the cached LDE @@ -866,30 +765,6 @@ pub trait IsStarkProver< } } - /// Returns the Merkle tree and the commitment to the evaluations of the parts of the - /// composition polynomial. - fn commit_composition_polynomial( - lde_composition_poly_parts_evaluations: &[Vec>], - ) -> Option<(BatchedMerkleTree, Commitment)> - where - FieldElement: AsBytes + Sync + Send, - FieldElement: AsBytes + Sync + Send + math::traits::ByteConversion, - { - let num_parts = lde_composition_poly_parts_evaluations.len(); - if num_parts == 0 { - return None; - } - let num_rows = lde_composition_poly_parts_evaluations[0].len(); - if num_rows == 0 { - return None; - } - let hashed_leaves = - keccak_leaves_row_pair_bit_reversed(lde_composition_poly_parts_evaluations); - let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; - let root = tree.root; - Some((tree, root)) - } - /// Algebraically decompose H(x) = H₀(x²) + x·H₁(x²) on the LDE coset, then /// extend each half to the full LDE domain. This replaces the expensive /// iFFT(2N) + break_in_parts + FFT(2N)×2 pipeline with: @@ -1033,11 +908,10 @@ pub trait IsStarkProver< } else { // Fallback for any future AIR with d > 2. let composition_poly = - Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset) - .unwrap(); + Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)?; let composition_poly_parts = composition_poly.break_in_parts(number_of_parts); - let cpu_eval = || -> Vec>> { + let cpu_eval = || -> Result>>, ProvingError> { composition_poly_parts .iter() .map(|part| { @@ -1047,7 +921,7 @@ pub trait IsStarkProver< domain.interpolation_domain_size, &domain.coset_offset, ) - .unwrap() + .map_err(ProvingError::from) }) .collect() }; @@ -1072,11 +946,11 @@ pub trait IsStarkProver< gpu_composition_parts = Some(handle); evals } - None => cpu_eval(), + None => cpu_eval()?, } } #[cfg(not(feature = "cuda"))] - cpu_eval() + cpu_eval()? }; #[cfg(feature = "instruments")] let fft_dur = t_sub.elapsed(); @@ -1098,8 +972,11 @@ pub trait IsStarkProver< let root = tree.root; (tree, root) } - None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) - .ok_or(ProvingError::EmptyCommitment)?, + None => crate::commitment::commit_bit_reversed( + &lde_composition_poly_parts_evaluations, + crate::commitment::ROWS_PER_LEAF, + ) + .ok_or(ProvingError::EmptyCommitment)?, }; #[cfg(feature = "instruments")] let merkle_dur = t_sub.elapsed(); @@ -1464,12 +1341,7 @@ pub trait IsStarkProver< }) .collect(); - #[cfg(feature = "parallel")] - let iter = (0..lde_size).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = 0..lde_size; - - iter.map(|i| { + crate::par::par_map_collect(0..lde_size, |i| { let mut result = FieldElement::::zero(); // H terms @@ -1495,7 +1367,6 @@ pub trait IsStarkProver< result }) - .collect() } /// Computes values and validity proofs of the evaluations of the composition polynomial parts @@ -1512,7 +1383,7 @@ pub trait IsStarkProver< { let proof = composition_poly_merkle_tree .get_proof_by_pos(index) - .unwrap(); + .expect("FRI query index in bounds"); let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations .iter() @@ -1525,8 +1396,7 @@ pub trait IsStarkProver< .collect(); PolynomialOpenings { - proof: proof.clone(), - proof_sym: proof, + proof, evaluations: lde_composition_poly_parts_evaluation .clone() .into_iter() @@ -1556,13 +1426,15 @@ pub trait IsStarkProver< G: Fn(usize) -> Vec>, { let domain_size = domain.lde_roots_of_unity_coset.len() as u64; - let index = challenge * 2; - let index_sym = challenge * 2 + 1; + // Rows `2·challenge` and `2·challenge+1` are committed together as the + // single leaf at position `challenge`; one Merkle path authenticates both + // the queried row and its symmetric counterpart. PolynomialOpenings { - proof: tree.get_proof_by_pos(index).unwrap(), - proof_sym: tree.get_proof_by_pos(index_sym).unwrap(), - evaluations: gather(reverse_index(index, domain_size)), - evaluations_sym: gather(reverse_index(index_sym, domain_size)), + proof: tree + .get_proof_by_pos(challenge) + .expect("FRI query index in bounds"), + evaluations: gather(reverse_index(challenge * 2, domain_size)), + evaluations_sym: gather(reverse_index(challenge * 2 + 1, domain_size)), } } @@ -1628,7 +1500,7 @@ pub trait IsStarkProver< openings } - // TODO: propagate errors instead of unwrap() in commit_columns, reconstruct_round1, and expand_columns_to_lde + // TODO: propagate errors instead of unwrap() in commit_main_trace, reconstruct_round1, and expand_columns_to_lde /// Generates STARK proofs for one or more AIRs with a shared transcript. /// /// # Multi-Table Proving with LogUp @@ -1727,11 +1599,7 @@ pub trait IsStarkProver< // Spill main traces to mmap before Round 1 LDE. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { - #[cfg(feature = "parallel")] - let spill_iter = air_trace_pairs.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let mut spill_iter = air_trace_pairs.iter_mut(); - spill_iter.try_for_each(|(_, trace, _)| { + crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(_, trace, _)| { trace .main_table .spill_to_disk() @@ -1768,13 +1636,8 @@ pub trait IsStarkProver< let chunk_end = (chunk_start + k).min(num_airs); let chunk_range = chunk_start..chunk_end; - #[cfg(feature = "parallel")] - let iter = chunk_range.into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = chunk_range; - - let chunk_results: Vec> = iter - .map(|idx| { + let chunk_results: Vec> = + crate::par::par_map_collect(chunk_range, |idx| { let (air, trace, _) = &air_trace_pairs[idx]; let domain = &domains[idx]; let twiddles = &twiddle_caches[idx]; @@ -1790,8 +1653,7 @@ pub trait IsStarkProver< #[cfg(feature = "disk-spill")] storage_mode, ) - }) - .collect(); + }); // Sequential: append roots to shared transcript (Fiat-Shamir ordering) for result in chunk_results { @@ -1863,17 +1725,13 @@ pub trait IsStarkProver< // Spill all aux trace tables to mmap before any Round 1 aux LDE work. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { - #[cfg(feature = "parallel")] - let spill_iter = air_trace_pairs.par_iter_mut(); - #[cfg(not(feature = "parallel"))] - let mut spill_iter = air_trace_pairs.iter_mut(); - spill_iter.try_for_each(|(air, trace, _)| { + crate::par::par_try_for_each_mut(&mut air_trace_pairs, |(air, trace, _)| { if air.has_aux_trace() { trace .spill_aux_to_disk() .map_err(|e| ProvingError::DiskSpill(format!("aux trace: {e}")))?; } - Ok(()) + Ok::<(), ProvingError>(()) })?; } @@ -1919,14 +1777,9 @@ pub trait IsStarkProver< let chunk_end = (chunk_start + k).min(num_airs); let chunk_range = chunk_start..chunk_end; - #[cfg(feature = "parallel")] - let iter = chunk_range.into_par_iter(); - #[cfg(not(feature = "parallel"))] - let iter = chunk_range; - #[allow(clippy::type_complexity)] - let chunk_aux: Vec, ProvingError>> = iter - .map(|idx| { + let chunk_aux: Vec, ProvingError>> = + crate::par::par_map_collect(chunk_range, |idx| { let (air, trace, _) = &air_trace_pairs[idx]; let domain = &domains[idx]; let twiddles = &twiddle_caches[idx]; @@ -1983,30 +1836,27 @@ pub trait IsStarkProver< let aux_lde_dur = t_sub.elapsed(); #[cfg(feature = "instruments")] let t_sub = Instant::now(); - #[allow(unused_mut)] - let (mut tree, root) = Self::commit_columns_bit_reversed(&columns) - .ok_or(ProvingError::EmptyCommitment)?; + let commit = Self::commit_plain::( + &columns, + #[cfg(feature = "disk-spill")] + storage_mode, + #[cfg(feature = "disk-spill")] + "aux Merkle tree", + )?; #[cfg(feature = "instruments")] crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); - #[cfg(feature = "disk-spill")] - if storage_mode == StorageMode::Disk { - tree.spill_nodes_to_disk().map_err(|e| { - ProvingError::DiskSpill(format!("aux Merkle tree: {e}")) - })?; - } #[cfg(feature = "cuda")] - return Ok((Some(TableCommit::plain(tree, root)), columns, None)); + return Ok((Some(commit), columns, None)); #[cfg(not(feature = "cuda"))] - Ok((Some(TableCommit::plain(tree, root)), columns)) + Ok((Some(commit), columns)) } else { #[cfg(feature = "cuda")] return Ok((None, Vec::new(), None)); #[cfg(not(feature = "cuda"))] Ok((None, Vec::new())) } - }) - .collect(); + }); // Sequential: append aux roots to forked transcripts. for (j, result) in chunk_aux.into_iter().enumerate() { @@ -2221,7 +2071,7 @@ pub trait IsStarkProver< // TODO: propagate errors instead of unwrap() in open_deep_composition_poly and FRI operations /// Executes rounds 2-4 and generates a STARK proof for the trace `main_trace` with public inputs `pub_inputs`. - /// Warning: the transcript must be safely initializated before passing it to this method. + /// Warning: the transcript must be safely initialized before passing it to this method. fn prove_rounds_2_to_4( air: &dyn AIR, pub_inputs: &PI, @@ -2234,7 +2084,7 @@ pub trait IsStarkProver< FieldElement: AsBytes, PI: Send + Sync + Clone, { - info!("Started proof generation..."); + log::debug!("Started proof generation..."); // =================================== // ==========| Round 2 |========== @@ -2347,7 +2197,7 @@ pub trait IsStarkProver< }); } - info!("End proof generation"); + log::debug!("End proof generation"); Ok(StarkProof { // [t] diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 68819c76b..03119f617 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -301,25 +301,11 @@ pub trait IsStarkVerifier< domain.lde_coset_element(reverse_index(raw, domain.lde_length as u64)) } - /// Verifies the validity of the opening proof. - fn verify_opening( - proof: &Proof, - root: &Commitment, - index: usize, - value: &[FieldElement], - ) -> bool - where - FieldElement: AsBytes + Sync + Send, - FieldElement: AsBytes + Sync + Send, - E: IsField, - Field: IsSubFieldOf, - { - proof.verify::>(root, index, &value.to_owned()) - } - - /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings - /// of a `PolynomialOpenings` against the given `root` at iota positions - /// `iota*2` and `iota*2 + 1`. + /// Verify a row-paired `PolynomialOpenings` against `root`. The row pair + /// (`2·iota`, `2·iota+1`) is committed as the single leaf at position `iota`, + /// so one Merkle path authenticates both rows: reconstruct the leaf from + /// `evaluations ‖ evaluations_sym` and verify once. (Same as the composition + /// opening check.) fn verify_opening_pair( opening: &PolynomialOpenings, root: &Commitment, @@ -331,13 +317,11 @@ pub trait IsStarkVerifier< E: IsField, Field: IsSubFieldOf, { - Self::verify_opening::(&opening.proof, root, iota * 2, &opening.evaluations) - && Self::verify_opening::( - &opening.proof_sym, - root, - iota * 2 + 1, - &opening.evaluations_sym, - ) + let mut value = opening.evaluations.clone(); + value.extend_from_slice(&opening.evaluations_sym); + opening + .proof + .verify::>(root, iota, &value) } /// Verify opening Open(tⱼ(D_LDE), 𝜐) and Open(tⱼ(D_LDE), -𝜐) for all trace polynomials tⱼ, diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs index 468e2a5b2..c4871765f 100644 --- a/prover/src/tables/bitwise.rs +++ b/prover/src/tables/bitwise.rs @@ -25,13 +25,13 @@ //! All lookups are provided as receivers with negative multiplicity, //! meaning other tables send to this table. -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; #[cfg(feature = "parallel")] use rayon::prelude::*; @@ -195,19 +195,19 @@ pub const fn is_preprocessed() -> bool { fn static_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xfb, 0x46, 0xff, 0x1c, 0xed, 0x4c, 0x97, 0xfb, 0xb2, 0x17, 0x55, 0x24, 0x08, 0x04, - 0x15, 0xee, 0xbe, 0xa6, 0xee, 0x86, 0x69, 0xaf, 0x3a, 0x4f, 0x9e, 0x2a, 0x44, 0x81, - 0xf9, 0xb0, 0xf3, 0xff, + 0xfa, 0x3e, 0xcf, 0x80, 0xfd, 0x95, 0xe5, 0x09, 0x74, 0xd4, 0x55, 0x23, 0xf6, 0x42, + 0xb6, 0x4b, 0x05, 0xc4, 0xf9, 0x66, 0xc2, 0x4d, 0xff, 0xda, 0x31, 0x47, 0xab, 0x7b, + 0x0c, 0x6d, 0xc4, 0xcf, ]), 4 => Some([ - 0xb5, 0xc4, 0xc0, 0x80, 0x03, 0x5b, 0xb6, 0x12, 0x78, 0x8c, 0x4d, 0xd4, 0x9e, 0x3d, - 0xc4, 0xe2, 0xef, 0x95, 0xf0, 0xbf, 0xe8, 0x1d, 0x98, 0xec, 0x7f, 0x58, 0x3a, 0x47, - 0x18, 0x03, 0x7e, 0xa5, + 0xff, 0x76, 0x8e, 0x85, 0x4b, 0xdc, 0x32, 0x61, 0x96, 0x16, 0x15, 0x19, 0x73, 0x70, + 0xf0, 0x64, 0x81, 0xfd, 0x4f, 0x5c, 0xbd, 0x9c, 0x30, 0x26, 0xd5, 0xc0, 0x81, 0xf3, + 0xce, 0x38, 0x50, 0x3e, ]), 8 => Some([ - 0x8a, 0x18, 0x70, 0x51, 0x34, 0x1a, 0x65, 0xaa, 0x79, 0x17, 0x07, 0x9a, 0xf3, 0x0b, - 0xcb, 0xd0, 0x7c, 0xe3, 0x2a, 0xce, 0x89, 0x9a, 0xfd, 0xc8, 0x0d, 0x6b, 0x48, 0x43, - 0x83, 0x5d, 0x18, 0xb8, + 0x0e, 0x1b, 0xc1, 0x0d, 0xae, 0x64, 0xe7, 0xca, 0xe0, 0x2a, 0x3b, 0xab, 0xd7, 0xd2, + 0xbb, 0x80, 0xd5, 0x24, 0x5a, 0xce, 0x25, 0xb6, 0x84, 0x77, 0x9c, 0xb5, 0xeb, 0x67, + 0x61, 0x82, 0x78, 0x3d, ]), _ => None, } @@ -283,7 +283,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { let coset_offset = FE::from(options.coset_offset); #[cfg(feature = "parallel")] - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .par_iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -292,7 +292,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { .collect(); #[cfg(not(feature = "parallel"))] - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -300,25 +300,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { }) .collect(); - // Step 4: Bit-reverse permute (parallel) - #[cfg(feature = "parallel")] - lde_columns.par_iter_mut().for_each(|col| { - in_place_bit_reverse_permute(col); - }); - - #[cfg(not(feature = "parallel"))] - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Step 5: Convert columns to rows for Merkle tree - let lde_rows = columns2rows(lde_columns); - - // Step 6: Build Merkle tree over LDE (N * blowup leaves) - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for bitwise LDE"); - - tree.root + root } /// Returns the preprocessed commitment for the bitwise table. diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs index 6cef6a482..a62fe8c69 100644 --- a/prover/src/tables/decode.rs +++ b/prover/src/tables/decode.rs @@ -34,13 +34,13 @@ use executor::elf::Elf; use executor::vm::instruction::decoding::{Instruction, InstructionError}; use executor::vm::memory::U64HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable}; @@ -286,7 +286,7 @@ pub fn compute_precomputed_commitment( // Step 4: Evaluate polynomials on LDE domain (N * blowup_factor points) let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -294,19 +294,9 @@ pub fn compute_precomputed_commitment( }) .collect(); - // Step 5: Bit-reverse permute (same as prover) - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Step 6: Convert columns to rows for Merkle tree - let lde_rows = columns2rows(lde_columns); - - // Step 7: Build Merkle tree over LDE (N * blowup leaves) - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for decode LDE"); - - tree.root + root } // ========================================================================= diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs index 3575c8ba1..f9f0d1cc4 100644 --- a/prover/src/tables/keccak_rc.rs +++ b/prover/src/tables/keccak_rc.rs @@ -8,13 +8,13 @@ //! committed via a static lookup table (with recompute as fallback for //! `ProofOptions` not covered by the static table). -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use executor::vm::instruction::execution::KECCAK_RC; @@ -97,19 +97,19 @@ pub const fn generate_row(round: usize) -> [u64; NUM_PRECOMPUTED_COLS] { fn static_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xe8, 0x06, 0x8b, 0xb2, 0xbd, 0x3d, 0x80, 0xf3, 0x92, 0x95, 0x31, 0x1a, 0xfd, 0x55, - 0xba, 0x12, 0x3f, 0x76, 0xeb, 0x44, 0x32, 0x57, 0x9d, 0xb7, 0x7f, 0x1e, 0x63, 0xb4, - 0x98, 0xb5, 0xb0, 0xb7, + 0xab, 0x7a, 0xad, 0xf5, 0xbf, 0xa2, 0xd5, 0x5c, 0x29, 0x83, 0x83, 0xe6, 0x2e, 0x47, + 0xa0, 0xa5, 0x22, 0xf9, 0x57, 0x89, 0x5a, 0x5c, 0xbb, 0x1f, 0x34, 0xbc, 0x21, 0x72, + 0xa9, 0x2c, 0x85, 0xe3, ]), 4 => Some([ - 0xa9, 0xfb, 0xc9, 0x15, 0x1c, 0x22, 0x75, 0xe7, 0x56, 0xeb, 0x6d, 0xf9, 0xfe, 0x83, - 0x2a, 0xb1, 0xa7, 0x1a, 0x20, 0x71, 0x9b, 0x0c, 0xff, 0x6b, 0x3f, 0x57, 0xc6, 0x84, - 0x3e, 0xbf, 0xc8, 0xaa, + 0xfb, 0x42, 0x58, 0x76, 0xf4, 0x30, 0x98, 0x04, 0xef, 0x8c, 0x4e, 0x65, 0xf7, 0x1a, + 0x29, 0x03, 0xd2, 0xc6, 0x12, 0x0d, 0x18, 0xe1, 0x28, 0x6e, 0x70, 0xeb, 0xa8, 0x11, + 0x3c, 0x3e, 0xe1, 0xdd, ]), 8 => Some([ - 0x5c, 0x30, 0xf6, 0xa0, 0xcf, 0x78, 0x43, 0x15, 0x5b, 0x5d, 0x18, 0x34, 0x44, 0xba, - 0x81, 0x9a, 0x64, 0x05, 0x5c, 0x79, 0x26, 0x18, 0x09, 0x24, 0x6b, 0xa2, 0x3f, 0x5f, - 0x77, 0x09, 0xd5, 0xfc, + 0x9f, 0x91, 0xaf, 0xb6, 0x5b, 0x75, 0x1e, 0xfb, 0x73, 0x93, 0x2c, 0xc4, 0xa8, 0xe1, + 0xb5, 0x21, 0x91, 0x5d, 0x6a, 0x19, 0x2e, 0x1d, 0xa8, 0x80, 0x21, 0x1f, 0x36, 0x76, + 0x9b, 0x8e, 0x3d, 0xb6, ]), _ => None, } @@ -144,7 +144,7 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { // Evaluate on LDE domain let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, NUM_ROWS, &coset_offset) @@ -152,17 +152,9 @@ pub fn compute_preprocessed_commitment(options: &ProofOptions) -> Commitment { }) .collect(); - // Bit-reverse permute - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - // Build Merkle tree - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for keccak_rc LDE"); - - tree.root + root } /// Returns the preprocessed commitment for the keccak_rc table. diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs index 174225ffa..2d1059bcc 100644 --- a/prover/src/tables/page.rs +++ b/prover/src/tables/page.rs @@ -32,13 +32,13 @@ use std::collections::HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable}; @@ -250,19 +250,19 @@ pub fn generate_page_trace( pub(crate) fn static_zero_page_commitment(blowup_factor: u8) -> Option { match blowup_factor { 2 => Some([ - 0xf9, 0x80, 0x0e, 0x45, 0x72, 0x5a, 0x8e, 0x8e, 0x5e, 0xd7, 0x5b, 0x60, 0xce, 0xd0, - 0x8e, 0xa3, 0x27, 0x3b, 0x8a, 0xb5, 0x98, 0xc0, 0xe3, 0x16, 0xf6, 0x86, 0x75, 0x39, - 0x4c, 0xe5, 0x88, 0x5e, + 0x7d, 0x74, 0x85, 0xf0, 0x2b, 0x74, 0xe0, 0x3f, 0x14, 0x99, 0xb3, 0xa0, 0x5f, 0x1d, + 0x6e, 0xf2, 0x21, 0xff, 0xaf, 0x24, 0x7e, 0x30, 0xb0, 0xda, 0x48, 0x79, 0xe1, 0x43, + 0xee, 0xea, 0x6a, 0x0f, ]), 4 => Some([ - 0x0f, 0xb5, 0x0c, 0xa8, 0x3b, 0x69, 0x4f, 0x91, 0x60, 0xbf, 0x0d, 0x0d, 0xd3, 0x33, - 0x25, 0x38, 0x11, 0xbb, 0xf8, 0xfd, 0x54, 0xbd, 0x06, 0x7d, 0xd1, 0xeb, 0xa3, 0x58, - 0xe8, 0x37, 0x45, 0x56, + 0x5c, 0xcc, 0x5b, 0xb1, 0xe8, 0x11, 0x91, 0x81, 0xbd, 0xdd, 0x39, 0x40, 0x77, 0x87, + 0xdc, 0x98, 0x06, 0x06, 0x8c, 0x63, 0xcd, 0xfd, 0xf1, 0xda, 0x4a, 0x55, 0x31, 0x4d, + 0x6a, 0x16, 0x18, 0xd0, ]), 8 => Some([ - 0x4a, 0xfb, 0xc9, 0x6d, 0x46, 0x29, 0xa3, 0xc2, 0x36, 0x14, 0xd8, 0x24, 0x3e, 0xef, - 0x97, 0x3f, 0xe1, 0xda, 0x2b, 0xf7, 0x87, 0xb6, 0x54, 0xe1, 0xc6, 0x46, 0xc0, 0x85, - 0x96, 0x7f, 0x7f, 0x48, + 0xf0, 0xc0, 0x69, 0xed, 0xf8, 0x59, 0xd6, 0x56, 0x15, 0x3c, 0x2f, 0x93, 0x65, 0xd6, + 0xe9, 0xe9, 0x8e, 0xd1, 0x83, 0x94, 0xf9, 0x75, 0x59, 0xd1, 0xec, 0x16, 0xe1, 0x37, + 0xd5, 0x32, 0xd6, 0xd9, ]), _ => None, } @@ -315,7 +315,7 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -323,14 +323,9 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption }) .collect(); - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for page LDE"); - tree.root + root } /// Returns the zero-init PAGE preprocessed commitment. diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs index 5a09fb2fa..0caac1ae7 100644 --- a/prover/src/tables/register.rs +++ b/prover/src/tables/register.rs @@ -20,13 +20,13 @@ use std::collections::HashMap; -use math::fft::bit_reversing::in_place_bit_reverse_permute; use math::polynomial::Polynomial; -use stark::config::{BatchedMerkleTree, Commitment}; +use stark::commitment::{ROWS_PER_LEAF, commit_bit_reversed}; +use stark::config::Commitment; use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing}; use stark::proof::options::ProofOptions; use stark::prover::evaluate_polynomial_on_lde_domain; -use stark::trace::{TraceTable, columns2rows}; +use stark::trace::TraceTable; use super::page::STACK_TOP; use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable}; @@ -220,7 +220,7 @@ pub fn compute_precomputed_commitment(options: &ProofOptions, entry_point: u64) let blowup_factor = options.blowup_factor as usize; let coset_offset = FE::from(options.coset_offset); - let mut lde_columns: Vec> = polys + let lde_columns: Vec> = polys .iter() .map(|poly| { evaluate_polynomial_on_lde_domain(poly, blowup_factor, num_rows, &coset_offset) @@ -228,14 +228,9 @@ pub fn compute_precomputed_commitment(options: &ProofOptions, entry_point: u64) }) .collect(); - for col in lde_columns.iter_mut() { - in_place_bit_reverse_permute(col); - } - - let lde_rows = columns2rows(lde_columns); - let tree = BatchedMerkleTree::::build(&lde_rows) + let (_, root) = commit_bit_reversed(&lde_columns, ROWS_PER_LEAF) .expect("Failed to build Merkle tree for register LDE"); - tree.root + root } /// Returns the preprocessed commitment for the REGISTER table. diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs index 43e6991cf..a761ac929 100644 --- a/prover/src/tests/decode_tests.rs +++ b/prover/src/tests/decode_tests.rs @@ -242,8 +242,8 @@ fn decode_commitment_zero_bytes_rejects() { /// AIR or FFT pipeline changes, this drifts and the test fails — /// regenerate via the `print_decode_commitment_for_sub` helper below. const SUB_DECODE_COMMITMENT_BLOWUP_2: [u8; 32] = [ - 0x60, 0x66, 0x0b, 0x18, 0x0d, 0x41, 0x08, 0xb3, 0x3a, 0x03, 0x99, 0x03, 0x8c, 0x9d, 0x12, 0x57, - 0x68, 0x8d, 0xed, 0x13, 0x60, 0xeb, 0x1d, 0x2b, 0xa8, 0xea, 0x1c, 0x76, 0xc9, 0xdd, 0x25, 0xaf, + 0xe9, 0x71, 0x68, 0xd6, 0x2e, 0xb1, 0xf6, 0x56, 0x61, 0x9d, 0x04, 0x6e, 0x65, 0xed, 0x63, 0x4a, + 0x27, 0xa3, 0x4d, 0xcb, 0x6c, 0x02, 0x11, 0xd7, 0x65, 0xc9, 0xc9, 0xfd, 0x59, 0x34, 0x41, 0x5f, ]; #[test]