From b181a74c077491d5d411ba40825c9f3b08ce5f67 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Wed, 24 Jun 2026 13:23:46 -0300 Subject: [PATCH 1/5] =?UTF-8?q?refactor(prover):=20LT/BRANCH=20one-row-per?= =?UTF-8?q?-op=20to=20match=20spec=20(=CE=BC=20is=20a=20Bit)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The spec types LT/BRANCH μ as a Bit (lt.toml, branch.toml), i.e. one trace row per operation with μ ∈ {0,1}. The impl deduplicated ops and stored a count in μ — a divergence from the spec (and an unsound count in a Bit-typed column). Drop the dedup: one row per op, μ = 1 (0 for padding). MUL/DVRM keep their dedup since the spec types those multiplicities as BaseField counts. Also makes LT/BRANCH trace gen deterministic (no HashMap iteration order) and aligns it with the bitwise collector (which already runs over raw ops). --- prover/src/tables/branch.rs | 20 ++----- prover/src/tables/lt.rs | 22 +++---- prover/src/tests/lt_tests.rs | 79 +++++++++++-------------- prover/src/tests/trace_builder_tests.rs | 31 +++++----- 4 files changed, 61 insertions(+), 91 deletions(-) diff --git a/prover/src/tables/branch.rs b/prover/src/tables/branch.rs index 1680b9edb..84f1b7b4e 100644 --- a/prover/src/tables/branch.rs +++ b/prover/src/tables/branch.rs @@ -153,21 +153,12 @@ impl BranchOperation { /// Generates the BRANCH trace table from a list of operations. /// -/// Duplicate operations (same pc, offset, register, jalr) are merged into a single row -/// with their multiplicities summed. The table is then padded to the next power of 2. +/// One row per operation with `μ = 1` (the spec types `μ` as a `Bit`). The table +/// is padded to the next power of two. 
pub fn generate_branch_trace( operations: &[BranchOperation], ) -> TraceTable { - use std::collections::HashMap; - - // Deduplicate operations: (pc, offset, register, jalr) -> multiplicity - let mut op_map: HashMap = HashMap::new(); - for op in operations { - *op_map.entry(op.clone()).or_insert(0) += 1; - } - - let unique_ops: Vec<_> = op_map.into_iter().collect(); - let num_rows = unique_ops.len().next_power_of_two().max(4); + let num_rows = operations.len().next_power_of_two().max(4); let mut trace = TraceTable::new_main( vec![FE::zero(); num_rows * cols::NUM_COLUMNS], cols::NUM_COLUMNS, @@ -175,7 +166,7 @@ pub fn generate_branch_trace( ); let table = &mut trace.main_table; - for (row_idx, (op, multiplicity)) in unique_ops.iter().enumerate() { + for (row_idx, op) in operations.iter().enumerate() { // Compute next_pc let next_pc_unmasked = op.compute_next_pc_unmasked(); let next_pc = op.compute_next_pc(); @@ -209,7 +200,8 @@ pub fn generate_branch_trace( &[next_pc_low_0, next_pc_low_1], ); table.set_byte(row_idx, cols::UNMASKED_LOW_BYTE, unmasked_low_byte); - table.set_u64(row_idx, cols::MU, *multiplicity); + // One row per op, so μ is the Bit the spec declares (1 real / 0 padding). + table.set_u64(row_idx, cols::MU, 1); } trace diff --git a/prover/src/tables/lt.rs b/prover/src/tables/lt.rs index 0b1a57616..8e6e1fcbe 100644 --- a/prover/src/tables/lt.rs +++ b/prover/src/tables/lt.rs @@ -156,21 +156,12 @@ impl LtOperation { /// Generates the LT trace table from a list of operations. /// -/// Duplicate operations (same lhs, rhs, signed) are merged into a single row -/// with their multiplicities summed. The table is then padded to the next power of 2. +/// One row per operation with `μ = 1` (the spec types `μ` as a `Bit`). The table +/// is padded to the next power of two. 
pub fn generate_lt_trace( operations: &[LtOperation], ) -> TraceTable { - use std::collections::HashMap; - - // Deduplicate operations: (lhs, rhs, signed) -> multiplicity - let mut op_map: HashMap = HashMap::new(); - for op in operations { - *op_map.entry(op.clone()).or_insert(0) += 1; - } - - let unique_ops: Vec<_> = op_map.into_iter().collect(); - let num_rows = unique_ops.len().next_power_of_two().max(4); + let num_rows = operations.len().next_power_of_two().max(4); let mut trace = TraceTable::new_main( vec![FE::zero(); num_rows * cols::NUM_COLUMNS], cols::NUM_COLUMNS, @@ -178,7 +169,7 @@ pub fn generate_lt_trace( ); let table = &mut trace.main_table; - for (row_idx, (op, multiplicity)) in unique_ops.iter().enumerate() { + for (row_idx, op) in operations.iter().enumerate() { // Store input columns table.set_dword_hhw(row_idx, cols::LHS_0, op.lhs); table.set_dword_hhw(row_idx, cols::RHS_0, op.rhs); @@ -205,8 +196,9 @@ pub fn generate_lt_trace( table.set_bool(row_idx, cols::INVERT, op.invert); table.set_bool(row_idx, cols::OUT, op.compute_out()); - // All LT lookups go through the unified ALU bus → single multiplicity. - table.set_u64(row_idx, cols::MU, *multiplicity); + // All LT lookups go through the unified ALU bus. One row per op, so + // μ is the Bit the spec declares (1 for real rows, 0 for padding). + table.set_u64(row_idx, cols::MU, 1); } trace diff --git a/prover/src/tests/lt_tests.rs b/prover/src/tests/lt_tests.rs index 77d8d1a89..48a070050 100644 --- a/prover/src/tests/lt_tests.rs +++ b/prover/src/tests/lt_tests.rs @@ -84,83 +84,72 @@ fn test_trace_generation() { } #[test] -fn test_multiplicity_aggregation() { - // Create 5 operations where (5, 10, UNSIGNED) appears 3 times +fn test_no_dedup_one_row_per_op() { + // 5 operations: (5, 10, UNSIGNED) appears 3×, (100, 200, UNSIGNED) 2×. + // Per the spec `μ` is a Bit, so there is NO deduplication: one row per op, + // each with μ = 1. 
let ops = vec![ - LtOperation::new(5, 10, UNSIGNED), // appears 1st time + LtOperation::new(5, 10, UNSIGNED), + LtOperation::new(100, 200, UNSIGNED), + LtOperation::new(5, 10, UNSIGNED), + LtOperation::new(5, 10, UNSIGNED), LtOperation::new(100, 200, UNSIGNED), - LtOperation::new(5, 10, UNSIGNED), // appears 2nd time - LtOperation::new(5, 10, UNSIGNED), // appears 3rd time - LtOperation::new(100, 200, UNSIGNED), // duplicate ]; let trace = generate_lt_trace(&ops); - // Should deduplicate to 2 unique rows, padded to 4 (minimum for FRI) - assert_eq!(trace.main_table.height, 4); + // 5 ops -> 5 rows, padded to the next power of two (8). + assert_eq!(trace.main_table.height, 8); - // Find each unique operation and check multiplicity - let mut found_5_10 = false; - let mut found_100_200 = false; - - for row_idx in 0..4 { + let mut count_5_10 = 0; + let mut count_100_200 = 0; + for row_idx in 0..trace.main_table.height { let row = trace.main_table.get_row(row_idx); if row[cols::LHS_0] == FE::from(5u64) && row[cols::RHS_0] == FE::from(10u64) { - assert_eq!( - row[cols::MU], - FE::from(3u64), - "Expected multiplicity 3 for (5, 10)" - ); - found_5_10 = true; + assert_eq!(row[cols::MU], FE::one(), "μ is the Bit 1 per row"); + count_5_10 += 1; } if row[cols::LHS_0] == FE::from(100u64) && row[cols::RHS_0] == FE::from(200u64) { - assert_eq!( - row[cols::MU], - FE::from(2u64), - "Expected multiplicity 2 for (100, 200)" - ); - found_100_200 = true; + assert_eq!(row[cols::MU], FE::one(), "μ is the Bit 1 per row"); + count_100_200 += 1; } } - assert!(found_5_10, "Row with lhs=5, rhs=10 not found"); - assert!(found_100_200, "Row with lhs=100, rhs=200 not found"); + assert_eq!(count_5_10, 3, "one row per (5, 10) op"); + assert_eq!(count_100_200, 2, "one row per (100, 200) op"); } #[test] -fn test_multiplicity_different_signed_flags() { - // Same lhs/rhs but different signed flag should be separate rows +fn test_signed_flag_separate_rows() { + // Same lhs/rhs, different signed flag, 
plus a repeat. No dedup (μ is a Bit): + // 3 ops -> 3 rows (2 unsigned, 1 signed), each μ = 1. let ops = vec![ - LtOperation::new(5, 10, UNSIGNED), // unsigned - LtOperation::new(5, 10, SIGNED), // signed - different operation! - LtOperation::new(5, 10, UNSIGNED), // unsigned again + LtOperation::new(5, 10, UNSIGNED), + LtOperation::new(5, 10, SIGNED), + LtOperation::new(5, 10, UNSIGNED), ]; let trace = generate_lt_trace(&ops); - // Should have 2 unique rows (unsigned and signed), padded to 4 (minimum for FRI) + // 3 ops -> 3 rows, padded to 4. assert_eq!(trace.main_table.height, 4); - let mut unsigned_mu = None; - let mut signed_mu = None; - - for row_idx in 0..4 { + let mut unsigned_rows = 0; + let mut signed_rows = 0; + for row_idx in 0..trace.main_table.height { let row = trace.main_table.get_row(row_idx); if row[cols::LHS_0] == FE::from(5u64) && row[cols::RHS_0] == FE::from(10u64) { + assert_eq!(row[cols::MU], FE::one(), "μ is the Bit 1 per row"); if row[cols::SIGNED] == FE::zero() { - unsigned_mu = Some(row[cols::MU]); + unsigned_rows += 1; } else { - signed_mu = Some(row[cols::MU]); + signed_rows += 1; } } } - assert_eq!( - unsigned_mu, - Some(FE::from(2u64)), - "Unsigned (5,10) should have mu=2" - ); - assert_eq!(signed_mu, Some(FE::one()), "Signed (5,10) should have mu=1"); + assert_eq!(unsigned_rows, 2, "two unsigned (5, 10) rows"); + assert_eq!(signed_rows, 1, "one signed (5, 10) row"); } #[test] diff --git a/prover/src/tests/trace_builder_tests.rs b/prover/src/tests/trace_builder_tests.rs index b3c1e1514..2d32e7cb9 100644 --- a/prover/src/tests/trace_builder_tests.rs +++ b/prover/src/tests/trace_builder_tests.rs @@ -164,7 +164,7 @@ fn test_lt_operations_collected() { } #[test] -fn test_lt_deduplication() { +fn test_lt_no_dedup_one_row_per_op() { let mut logs = vec![ make_slt_log(0x1000, 5, 10, 1), make_slt_log(0x1004, 5, 10, 1), // duplicate @@ -202,32 +202,29 @@ fn test_lt_deduplication() { let traces = Traces::from_logs(&logs, instructions, 
&Default::default()).unwrap(); - // The 3 identical SLT operations (5 < 10, signed) should be deduplicated. - // With MEMW timestamp ordering LT ops also added, the table is larger, - // but we can verify the SLT deduplication by finding the row with lhs=5, rhs=10. - let mut found_slt = false; + // Per the spec, `μ` is a Bit: one row per op (no deduplication). The 3 + // identical SLT operations (5 < 10, signed) each get their own row with μ = 1. + let mut slt_rows = 0; for row_idx in 0..traces.lts[0].main_table.height { let row = traces.lts[0].main_table.get_row(row_idx); - // Check for our SLT: lhs=5, rhs=10, signed=1 - // lhs is stored as DWordHHW: [half0, half1, word2] - // For value 5: half0=5, half1=0, word2=0 + // lhs is stored as DWordHHW: [half0, half1, word2]; value 5 => [5, 0, 0]. if row[lt::cols::LHS_0] == FE::from(5u64) && row[lt::cols::LHS_1] == FE::from(0u64) && row[lt::cols::LHS_2] == FE::from(0u64) && row[lt::cols::RHS_0] == FE::from(10u64) && row[lt::cols::SIGNED] == FE::from(1u64) { - // Found our SLT row - verify multiplicity is 3. Every LT lookup - // (including SLT) goes through the unified ALU bus and - // is counted in the single `MU` column. - assert_eq!(row[lt::cols::MU], FE::from(3u64)); - found_slt = true; - break; + assert_eq!( + row[lt::cols::MU], + FE::from(1u64), + "μ is the Bit 1 for each real LT row" + ); + slt_rows += 1; } } - assert!( - found_slt, - "SLT operation (5 < 10, signed) not found in LT table" + assert_eq!( + slt_rows, 3, + "expected one LT row per SLT op (spec: μ is a Bit, no dedup)" ); } From e2cd492a9b796b66264033292829986bc5ab3926 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Wed, 24 Jun 2026 13:44:55 -0300 Subject: [PATCH 2/5] perf(prover): parallelize per-table chunk generation chunk_and_generate built each table's chunks sequentially. Chunks are independent, so generate them with rayon (gated on the `parallel` feature); `collect` into `Result<Vec<_>, _>` preserves chunk order, so output is byte-identical. NOTE(review): under `parallel`, several chunks are now generated (and, with `disk-spill`, spilled) concurrently, so peak heap is bounded by the chunks in flight rather than by a single chunk; the unchanged `chunk_and_generate` doc comment ("spilled to mmap before the next chunk is built") no longer describes that configuration.
Tables are still generated one at a time (no all-tables-parallel), keeping it compatible with sequential / on-demand commit. --- prover/src/tables/trace_builder.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs index 02371c1a0..fc776677a 100644 --- a/prover/src/tables/trace_builder.rs +++ b/prover/src/tables/trace_builder.rs @@ -2520,10 +2520,10 @@ struct CollectedOps { /// Chunk raw ops and generate one trace table per chunk. When `storage_mode` /// is `Disk`, each chunk's main table is spilled to mmap before the next chunk /// is built so peak heap usage stays bounded. -fn chunk_and_generate( +fn chunk_and_generate( ops: &[T], max_rows: usize, - generate: impl Fn(&[T]) -> TraceTable, + generate: impl Fn(&[T]) -> TraceTable + Sync, #[cfg(feature = "disk-spill")] storage_mode: StorageMode, ) -> Result>, Error> { let op_chunks: Vec<&[T]> = if ops.is_empty() { @@ -2531,8 +2531,11 @@ fn chunk_and_generate( } else { ops.chunks(max_rows).collect() }; - let mut tables = Vec::with_capacity(op_chunks.len()); - for chunk in op_chunks { + + // Each chunk is independent, so generate them concurrently. `collect` into a + // `Result>` preserves chunk order, so the output is byte-identical to + // the sequential build. + let gen_one = |chunk: &[T]| -> Result, Error> { #[allow(unused_mut)] let mut t = generate(chunk); #[cfg(feature = "disk-spill")] @@ -2541,9 +2544,18 @@ fn chunk_and_generate( .spill_to_disk() .map_err(|e| Error::Prover(format!("disk-spill trace: {e}")))?; } - tables.push(t); + Ok(t) + }; + + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + op_chunks.into_par_iter().map(gen_one).collect() + } + #[cfg(not(feature = "parallel"))] + { + op_chunks.into_iter().map(gen_one).collect() } - Ok(tables) } /// Phase 2: Collect and route all operations from CPU ops. 
From 22ad4cf9d16ca8e3c310a19ba14d15201f459b3c Mon Sep 17 00:00:00 2001 From: diegokingston Date: Wed, 24 Jun 2026 13:48:19 -0300 Subject: [PATCH 3/5] perf(prover): pre-size MUL/DVRM dedup HashMaps (with_capacity) Avoids rehashing as the dedup map grows. Byte-identical (same dedup result). NOTE(review): "same dedup result" holds for the set of unique ops and their multiplicities; if rows are emitted in HashMap iteration order (as the removed LT/BRANCH code did), that order is unspecified and may depend on the table's capacity, so confirm the row order is fixed elsewhere before relying on byte-identical traces. --- prover/src/tables/dvrm.rs | 6 ++++-- prover/src/tables/mul.rs | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs index d3adbdc53..0ab7f4eac 100644 --- a/prover/src/tables/dvrm.rs +++ b/prover/src/tables/dvrm.rs @@ -287,8 +287,10 @@ impl DvrmOperation { pub fn generate_dvrm_trace( operations: &[(DvrmOperation, bool)], ) -> TraceTable { - // Deduplicate: (n, d, signed) -> (mu_q, mu_r) - let mut op_map: HashMap = HashMap::new(); + // Deduplicate: (n, d, signed) -> (mu_q, mu_r). + // Pre-size to the op count (an upper bound on unique ops) to avoid rehashing. + let mut op_map: HashMap = + HashMap::with_capacity(operations.len()); for (op, wants_remainder) in operations { let entry = op_map.entry(op.clone()).or_default(); diff --git a/prover/src/tables/mul.rs b/prover/src/tables/mul.rs index ba414dc63..3bba4da47 100644 --- a/prover/src/tables/mul.rs +++ b/prover/src/tables/mul.rs @@ -295,8 +295,10 @@ impl MulOperation { pub fn generate_mul_trace( operations: &[(MulOperation, bool)], ) -> TraceTable { - // Deduplicate: (lhs, lhs_signed, rhs, rhs_signed) -> (mu_lo, mu_hi) - let mut op_map: HashMap = HashMap::new(); + // Deduplicate: (lhs, lhs_signed, rhs, rhs_signed) -> (mu_lo, mu_hi). + // Pre-size to the op count (an upper bound on unique ops) to avoid rehashing.
+ let mut op_map: HashMap = + HashMap::with_capacity(operations.len()); for (op, wants_hi) in operations { let entry = op_map.entry(op.clone()).or_default(); From 270bb71c61e2905013a22a1cefa97183d7d28e66 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Wed, 24 Jun 2026 13:50:29 -0300 Subject: [PATCH 4/5] perf(prover): DVRM trace gen computes the remainder once per row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit generate_dvrm_trace called compute_remainder() ~6× per row (via n_sub_r/abs_r/ sign_r/sign_n_sub_r, each re-running the integer division). Derive sign_r, n_sub_r, sign_n_sub_r and abs_r from the single r computed up front. Byte-identical (same formulas). --- prover/src/tables/dvrm.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs index 0ab7f4eac..57eff713a 100644 --- a/prover/src/tables/dvrm.rs +++ b/prover/src/tables/dvrm.rs @@ -313,8 +313,12 @@ pub fn generate_dvrm_trace( for (row_idx, (op, multiplicities)) in unique_ops.iter().enumerate() { let q = op.compute_quotient(); let r = op.compute_remainder(); - let n_sub_r = op.n_sub_r(); - let abs_r = op.abs_r(); + // Derive the rest from the single `r` above instead of the helper methods, + // each of which recomputes the integer division internally (6×/row → 1×). 
+ let sign_r = op.signed && (r >> 63) == 1; + let n_sub_r = op.n.wrapping_sub(r); + let sign_n_sub_r = op.signed && (n_sub_r >> 63) == 1; + let abs_r = DvrmOperation::abs_value(r, sign_r); let abs_d = op.abs_d(); // Fill n as DWordHL (4 halfwords) @@ -341,11 +345,11 @@ pub fn generate_dvrm_trace( // Fill n_sub_r as DWordHL (4 halfwords) table.set_dword_hl(row_idx, cols::N_SUB_R_0, n_sub_r); - table.set_bool(row_idx, cols::SIGN_N_SUB_R, op.sign_n_sub_r()); + table.set_bool(row_idx, cols::SIGN_N_SUB_R, sign_n_sub_r); table.set_bool(row_idx, cols::SIGN_N, op.sign_n()); table.set_bool(row_idx, cols::SIGN_D, op.sign_d()); table.set_bool(row_idx, cols::SIGN_Q, op.sign_q()); - table.set_bool(row_idx, cols::SIGN_R, op.sign_r()); + table.set_bool(row_idx, cols::SIGN_R, sign_r); // Multiplicities table.set_u64(row_idx, cols::MU_Q, multiplicities.mu_q); From 04356e1832dff5e1c990da00271c6391942a6c90 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Wed, 24 Jun 2026 14:29:48 -0300 Subject: [PATCH 5/5] perf(prover): collect state-free CPU chips in parallel (PHASE 2 split) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit collect_ops_from_cpu interleaved state-dependent work (MEMW/register/commit/ keccak/ecsm — which thread memory/register state, inherently serial) with state-free work (CPU range-check bitwise lookups + CPU32/LT/SHIFT dispatch, derived purely from each logged op). Split them: the state-free chips are now collected in a parallel pass (collect_state_free_ops, rayon under the parallel feature) while the serial loop keeps only the state-threaded work. For CPU-heavy programs the per-op bitwise range-check collection is a large state-free chunk that now runs off the serial path. Output is unchanged: LT/SHIFT/CPU32 stay in program order (ordered collect); the bitwise multiplicity accumulation is order-independent. NOTE(review): in the trace_builder.rs hunk below, `collect_state_free_ops` (with its own `///` block) is inserted directly after the existing `/// Returns: (memw_ops, ...)` doc lines and before `#[allow(clippy::type_complexity)] fn collect_ops_from_cpu`, so those pre-existing doc lines now attach to `collect_state_free_ops` and `collect_ops_from_cpu` is left undocumented; move the new function above that doc block.
--- prover/src/tables/trace_builder.rs | 120 +++++++++++++++++++---------- 1 file changed, 78 insertions(+), 42 deletions(-) diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs index fc776677a..0093a79fa 100644 --- a/prover/src/tables/trace_builder.rs +++ b/prover/src/tables/trace_builder.rs @@ -355,6 +355,71 @@ fn collect_cpu_ops( /// /// Returns: (memw_ops, load_ops, lt_ops, shift_ops, bitwise_ops, commit_ops, keccak_ops, /// cpu32_ops, ecsm_ops, ec_scalar_ops, ecdas_ops) +/// Collect the chips that depend only on each `CpuOperation` (no memory/register +/// state): the CPU range-check bitwise lookups plus the CPU32 / LT / SHIFT +/// dispatch. Parallel under the `parallel` feature; results stay in program +/// order, matching the sequential build. +fn collect_state_free_ops( + cpu_ops: &[CpuOperation], +) -> ( + Vec, + Vec, + Vec, + Vec, +) { + let lt = |op: &CpuOperation| -> Option { + let f = op.decode.fields; + (!f.word_instr && f.is_lt()).then(|| { + LtOperation::new_with_invert(op.rv1, op.arg2, f.alu_signed(), f.alu_signed2_or_invert()) + }) + }; + let shift = |op: &CpuOperation| -> Option { + let f = op.decode.fields; + (!f.word_instr && f.is_shift()).then(|| { + ShiftOperation::new( + op.rv1, + op.arg2, + f.alu_signed2_or_invert(), + f.alu_signed(), + f.word_instr, + ) + }) + }; + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + ( + cpu_ops + .par_iter() + .flat_map_iter(|op| op.collect_bitwise_ops()) + .collect(), + cpu_ops + .par_iter() + .filter(|op| op.decode.fields.word_instr) + .map(build_cpu32_op) + .collect(), + cpu_ops.par_iter().filter_map(lt).collect(), + cpu_ops.par_iter().filter_map(shift).collect(), + ) + } + #[cfg(not(feature = "parallel"))] + { + ( + cpu_ops + .iter() + .flat_map(|op| op.collect_bitwise_ops()) + .collect(), + cpu_ops + .iter() + .filter(|op| op.decode.fields.word_instr) + .map(build_cpu32_op) + .collect(), + cpu_ops.iter().filter_map(lt).collect(), + 
cpu_ops.iter().filter_map(shift).collect(), + ) + } +} + #[allow(clippy::type_complexity)] fn collect_ops_from_cpu( cpu_ops: &[CpuOperation], @@ -373,14 +438,16 @@ fn collect_ops_from_cpu( Vec, Vec, ) { + // State-free chips (CPU range-check bitwise lookups + CPU32/LT/SHIFT dispatch) + // are collected in parallel; the loop below only does the state-dependent work + // (MEMW/register/commit/keccak/ecsm — which thread memory/register state). + let (cpu_bitwise_ops, cpu32_ops, lt_ops, shift_ops) = collect_state_free_ops(cpu_ops); + let mut memw_ops = Vec::with_capacity(cpu_ops.len() * 3); let mut load_ops = Vec::with_capacity(cpu_ops.len() / 8 + 1); - let mut lt_ops = Vec::with_capacity(cpu_ops.len() / 10 + 1); - let mut shift_ops = Vec::with_capacity(cpu_ops.len() / 10 + 1); let mut bitwise_ops = Vec::with_capacity(cpu_ops.len() * 4); let mut commit_ops = Vec::new(); let mut keccak_ops = Vec::new(); - let mut cpu32_ops = Vec::new(); let mut ecsm_ops = Vec::new(); let mut ec_scalar_ops = Vec::new(); let mut ecdas_ops = Vec::new(); @@ -388,12 +455,9 @@ fn collect_ops_from_cpu( let mut commit_ecall_count = 0u32; for op in cpu_ops { - // Word (`*W`) instructions delegate to the CPU32 table (built in program - // order; its register accesses are still emitted via the shared register - // collector below so the MEMW table balances). - if op.decode.fields.word_instr { - cpu32_ops.push(build_cpu32_op(op)); - } + // CPU32 register accesses are still emitted via the shared register + // collector below so the MEMW table balances; the CPU32 op itself is + // built in the state-free parallel pass. 
// --- MEMW and LOAD (require state tracking, order matters) --- @@ -474,41 +538,13 @@ fn collect_ops_from_cpu( ec_scalar_ops.extend(ec_scalar_rows); ecdas_ops.extend(ecdas_rows); } - - // --- ALU chip dispatch (no state tracking) --- - // Word (`*W`) instructions are delegated to CPU32 (which itself drives - // the ALU chips); the main CPU does not send the ALU bus for them, so we - // must not emit chip ops here. CPU32 op-generation is B5b. - let f = op.decode.fields; - if !f.word_instr { - // LT: SLT / BLT / BGE, dispatched on the unified ALU bus. `invert` - // (BGE/BGEU) is applied inside the LT chip (`out = lt XOR invert`). - if f.is_lt() { - lt_ops.push(LtOperation::new_with_invert( - op.rv1, - op.arg2, - f.alu_signed(), - f.alu_signed2_or_invert(), - )); - } - // SHIFT: SLL/SRL/SRA. direction = invert bit (0 = left, 1 = right). - // The full arg2 goes on the ALU bus as in2; the chip uses its low - // byte for the (mod 32/64) computation. - if f.is_shift() { - shift_ops.push(ShiftOperation::new( - op.rv1, - op.arg2, - f.alu_signed2_or_invert(), - f.alu_signed(), - f.word_instr, - )); - } - } - - // Collect CPU range-check bitwise lookups (ARE_BYTES + IS_HALF). - bitwise_ops.extend(op.collect_bitwise_ops()); } + // CPU range-check lookups (ARE_BYTES + IS_HALF) were collected in the + // state-free pass above; merge them in. Order is irrelevant for the bitwise + // multiplicity accumulation. + bitwise_ops.extend(cpu_bitwise_ops); + // Each ecall generates count+1 operations (count real rows + 1 end row) debug_assert_eq!( commit_ops.len(),