From b181a74c077491d5d411ba40825c9f3b08ce5f67 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 24 Jun 2026 13:23:46 -0300
Subject: [PATCH 1/5] =?UTF-8?q?refactor(prover):=20LT/BRANCH=20one-row-per?=
 =?UTF-8?q?-op=20to=20match=20spec=20(=CE=BC=20is=20a=20Bit)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The spec types LT/BRANCH μ as a Bit (lt.toml, branch.toml), i.e. one trace
row per operation with μ ∈ {0,1}. The impl deduplicated ops and stored a
count in μ — a divergence from the spec (and an unsound count in a Bit-typed
column). Drop the dedup: one row per op, μ = 1 (0 for padding). MUL/DVRM keep
their dedup since the spec types those multiplicities as BaseField counts.

Also makes LT/BRANCH trace gen deterministic (no HashMap iteration order) and
aligns it with the bitwise collector (which already runs over raw ops).
---
 prover/src/tables/branch.rs             | 20 ++-----
 prover/src/tables/lt.rs                 | 22 +++----
 prover/src/tests/lt_tests.rs            | 79 +++++++++++--------------
 prover/src/tests/trace_builder_tests.rs | 31 +++++-----
 4 files changed, 61 insertions(+), 91 deletions(-)

diff --git a/prover/src/tables/branch.rs b/prover/src/tables/branch.rs
index 1680b9edb..84f1b7b4e 100644
--- a/prover/src/tables/branch.rs
+++ b/prover/src/tables/branch.rs
@@ -153,21 +153,12 @@ impl BranchOperation {
 
 /// Generates the BRANCH trace table from a list of operations.
 ///
-/// Duplicate operations (same pc, offset, register, jalr) are merged into a single row
-/// with their multiplicities summed. The table is then padded to the next power of 2.
+/// One row per operation with `μ = 1` (the spec types `μ` as a `Bit`). The table
+/// is zero-padded (padding rows keep `μ = 0`) to the next power of two, with a minimum of 4 rows.
 pub fn generate_branch_trace(
     operations: &[BranchOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
-    use std::collections::HashMap;
-
-    // Deduplicate operations: (pc, offset, register, jalr) -> multiplicity
-    let mut op_map: HashMap<BranchOperation, u64> = HashMap::new();
-    for op in operations {
-        *op_map.entry(op.clone()).or_insert(0) += 1;
-    }
-
-    let unique_ops: Vec<_> = op_map.into_iter().collect();
-    let num_rows = unique_ops.len().next_power_of_two().max(4);
+    let num_rows = operations.len().next_power_of_two().max(4);
     let mut trace = TraceTable::new_main(
         vec![FE::zero(); num_rows * cols::NUM_COLUMNS],
         cols::NUM_COLUMNS,
@@ -175,7 +166,7 @@ pub fn generate_branch_trace(
     );
     let table = &mut trace.main_table;
 
-    for (row_idx, (op, multiplicity)) in unique_ops.iter().enumerate() {
+    for (row_idx, op) in operations.iter().enumerate() {
         // Compute next_pc
         let next_pc_unmasked = op.compute_next_pc_unmasked();
         let next_pc = op.compute_next_pc();
@@ -209,7 +200,8 @@ pub fn generate_branch_trace(
             &[next_pc_low_0, next_pc_low_1],
         );
         table.set_byte(row_idx, cols::UNMASKED_LOW_BYTE, unmasked_low_byte);
-        table.set_u64(row_idx, cols::MU, *multiplicity);
+        // One row per op, so μ is the Bit the spec declares (1 real / 0 padding).
+        table.set_u64(row_idx, cols::MU, 1);
     }
 
     trace
diff --git a/prover/src/tables/lt.rs b/prover/src/tables/lt.rs
index 0b1a57616..8e6e1fcbe 100644
--- a/prover/src/tables/lt.rs
+++ b/prover/src/tables/lt.rs
@@ -156,21 +156,12 @@ impl LtOperation {
 
 /// Generates the LT trace table from a list of operations.
 ///
-/// Duplicate operations (same lhs, rhs, signed) are merged into a single row
-/// with their multiplicities summed. The table is then padded to the next power of 2.
+/// One row per operation with `μ = 1` (the spec types `μ` as a `Bit`). The table
+/// is zero-padded (padding rows keep `μ = 0`) to the next power of two, with a minimum of 4 rows.
 pub fn generate_lt_trace(
     operations: &[LtOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
-    use std::collections::HashMap;
-
-    // Deduplicate operations: (lhs, rhs, signed) -> multiplicity
-    let mut op_map: HashMap<LtOperation, u64> = HashMap::new();
-    for op in operations {
-        *op_map.entry(op.clone()).or_insert(0) += 1;
-    }
-
-    let unique_ops: Vec<_> = op_map.into_iter().collect();
-    let num_rows = unique_ops.len().next_power_of_two().max(4);
+    let num_rows = operations.len().next_power_of_two().max(4);
     let mut trace = TraceTable::new_main(
         vec![FE::zero(); num_rows * cols::NUM_COLUMNS],
         cols::NUM_COLUMNS,
@@ -178,7 +169,7 @@ pub fn generate_lt_trace(
     );
     let table = &mut trace.main_table;
 
-    for (row_idx, (op, multiplicity)) in unique_ops.iter().enumerate() {
+    for (row_idx, op) in operations.iter().enumerate() {
         // Store input columns
         table.set_dword_hhw(row_idx, cols::LHS_0, op.lhs);
         table.set_dword_hhw(row_idx, cols::RHS_0, op.rhs);
@@ -205,8 +196,9 @@ pub fn generate_lt_trace(
         table.set_bool(row_idx, cols::INVERT, op.invert);
         table.set_bool(row_idx, cols::OUT, op.compute_out());
 
-        // All LT lookups go through the unified ALU bus → single multiplicity.
-        table.set_u64(row_idx, cols::MU, *multiplicity);
+        // All LT lookups go through the unified ALU bus. One row per op, so
+        // μ is the Bit the spec declares (1 for real rows, 0 for padding).
+        table.set_u64(row_idx, cols::MU, 1);
     }
 
     trace
diff --git a/prover/src/tests/lt_tests.rs b/prover/src/tests/lt_tests.rs
index 77d8d1a89..48a070050 100644
--- a/prover/src/tests/lt_tests.rs
+++ b/prover/src/tests/lt_tests.rs
@@ -84,83 +84,72 @@ fn test_trace_generation() {
 }
 
 #[test]
-fn test_multiplicity_aggregation() {
-    // Create 5 operations where (5, 10, UNSIGNED) appears 3 times
+fn test_no_dedup_one_row_per_op() {
+    // 5 operations: (5, 10, UNSIGNED) appears 3×, (100, 200, UNSIGNED) 2×.
+    // Per the spec `μ` is a Bit, so there is NO deduplication: one row per op,
+    // each with μ = 1.
     let ops = vec![
-        LtOperation::new(5, 10, UNSIGNED), // appears 1st time
+        LtOperation::new(5, 10, UNSIGNED),
+        LtOperation::new(100, 200, UNSIGNED),
+        LtOperation::new(5, 10, UNSIGNED),
+        LtOperation::new(5, 10, UNSIGNED),
         LtOperation::new(100, 200, UNSIGNED),
-        LtOperation::new(5, 10, UNSIGNED),    // appears 2nd time
-        LtOperation::new(5, 10, UNSIGNED),    // appears 3rd time
-        LtOperation::new(100, 200, UNSIGNED), // duplicate
     ];
 
     let trace = generate_lt_trace(&ops);
 
-    // Should deduplicate to 2 unique rows, padded to 4 (minimum for FRI)
-    assert_eq!(trace.main_table.height, 4);
+    // 5 ops -> 5 rows, padded to the next power of two (8).
+    assert_eq!(trace.main_table.height, 8);
 
-    // Find each unique operation and check multiplicity
-    let mut found_5_10 = false;
-    let mut found_100_200 = false;
-
-    for row_idx in 0..4 {
+    let mut count_5_10 = 0;
+    let mut count_100_200 = 0;
+    for row_idx in 0..trace.main_table.height {
         let row = trace.main_table.get_row(row_idx);
         if row[cols::LHS_0] == FE::from(5u64) && row[cols::RHS_0] == FE::from(10u64) {
-            assert_eq!(
-                row[cols::MU],
-                FE::from(3u64),
-                "Expected multiplicity 3 for (5, 10)"
-            );
-            found_5_10 = true;
+            assert_eq!(row[cols::MU], FE::one(), "μ is the Bit 1 per row");
+            count_5_10 += 1;
         }
         if row[cols::LHS_0] == FE::from(100u64) && row[cols::RHS_0] == FE::from(200u64) {
-            assert_eq!(
-                row[cols::MU],
-                FE::from(2u64),
-                "Expected multiplicity 2 for (100, 200)"
-            );
-            found_100_200 = true;
+            assert_eq!(row[cols::MU], FE::one(), "μ is the Bit 1 per row");
+            count_100_200 += 1;
         }
     }
 
-    assert!(found_5_10, "Row with lhs=5, rhs=10 not found");
-    assert!(found_100_200, "Row with lhs=100, rhs=200 not found");
+    assert_eq!(count_5_10, 3, "one row per (5, 10) op");
+    assert_eq!(count_100_200, 2, "one row per (100, 200) op");
 }
 
 #[test]
-fn test_multiplicity_different_signed_flags() {
-    // Same lhs/rhs but different signed flag should be separate rows
+fn test_signed_flag_separate_rows() {
+    // Same lhs/rhs, different signed flag, plus a repeat. No dedup (μ is a Bit):
+    // 3 ops -> 3 rows (2 unsigned, 1 signed), each μ = 1.
     let ops = vec![
-        LtOperation::new(5, 10, UNSIGNED), // unsigned
-        LtOperation::new(5, 10, SIGNED),   // signed - different operation!
-        LtOperation::new(5, 10, UNSIGNED), // unsigned again
+        LtOperation::new(5, 10, UNSIGNED),
+        LtOperation::new(5, 10, SIGNED),
+        LtOperation::new(5, 10, UNSIGNED),
     ];
 
     let trace = generate_lt_trace(&ops);
 
-    // Should have 2 unique rows (unsigned and signed), padded to 4 (minimum for FRI)
+    // 3 ops -> 3 rows, padded to 4.
     assert_eq!(trace.main_table.height, 4);
 
-    let mut unsigned_mu = None;
-    let mut signed_mu = None;
-
-    for row_idx in 0..4 {
+    let mut unsigned_rows = 0;
+    let mut signed_rows = 0;
+    for row_idx in 0..trace.main_table.height {
         let row = trace.main_table.get_row(row_idx);
         if row[cols::LHS_0] == FE::from(5u64) && row[cols::RHS_0] == FE::from(10u64) {
+            assert_eq!(row[cols::MU], FE::one(), "μ is the Bit 1 per row");
             if row[cols::SIGNED] == FE::zero() {
-                unsigned_mu = Some(row[cols::MU]);
+                unsigned_rows += 1;
             } else {
-                signed_mu = Some(row[cols::MU]);
+                signed_rows += 1;
             }
         }
     }
 
-    assert_eq!(
-        unsigned_mu,
-        Some(FE::from(2u64)),
-        "Unsigned (5,10) should have mu=2"
-    );
-    assert_eq!(signed_mu, Some(FE::one()), "Signed (5,10) should have mu=1");
+    assert_eq!(unsigned_rows, 2, "two unsigned (5, 10) rows");
+    assert_eq!(signed_rows, 1, "one signed (5, 10) row");
 }
 
 #[test]
diff --git a/prover/src/tests/trace_builder_tests.rs b/prover/src/tests/trace_builder_tests.rs
index b3c1e1514..2d32e7cb9 100644
--- a/prover/src/tests/trace_builder_tests.rs
+++ b/prover/src/tests/trace_builder_tests.rs
@@ -164,7 +164,7 @@ fn test_lt_operations_collected() {
 }
 
 #[test]
-fn test_lt_deduplication() {
+fn test_lt_no_dedup_one_row_per_op() {
     let mut logs = vec![
         make_slt_log(0x1000, 5, 10, 1),
         make_slt_log(0x1004, 5, 10, 1), // duplicate
@@ -202,32 +202,29 @@ fn test_lt_deduplication() {
 
     let traces = Traces::from_logs(&logs, instructions, &Default::default()).unwrap();
 
-    // The 3 identical SLT operations (5 < 10, signed) should be deduplicated.
-    // With MEMW timestamp ordering LT ops also added, the table is larger,
-    // but we can verify the SLT deduplication by finding the row with lhs=5, rhs=10.
-    let mut found_slt = false;
+    // Per the spec, `μ` is a Bit: one row per op (no deduplication). The 3
+    // identical SLT operations (5 < 10, signed) each get their own row with μ = 1.
+    let mut slt_rows = 0;
     for row_idx in 0..traces.lts[0].main_table.height {
         let row = traces.lts[0].main_table.get_row(row_idx);
-        // Check for our SLT: lhs=5, rhs=10, signed=1
-        // lhs is stored as DWordHHW: [half0, half1, word2]
-        // For value 5: half0=5, half1=0, word2=0
+        // lhs is stored as DWordHHW: [half0, half1, word2]; value 5 => [5, 0, 0].
         if row[lt::cols::LHS_0] == FE::from(5u64)
             && row[lt::cols::LHS_1] == FE::from(0u64)
             && row[lt::cols::LHS_2] == FE::from(0u64)
             && row[lt::cols::RHS_0] == FE::from(10u64)
             && row[lt::cols::SIGNED] == FE::from(1u64)
         {
-            // Found our SLT row - verify multiplicity is 3. Every LT lookup
-            // (including SLT) goes through the unified ALU bus and
-            // is counted in the single `MU` column.
-            assert_eq!(row[lt::cols::MU], FE::from(3u64));
-            found_slt = true;
-            break;
+            assert_eq!(
+                row[lt::cols::MU],
+                FE::from(1u64),
+                "μ is the Bit 1 for each real LT row"
+            );
+            slt_rows += 1;
         }
     }
-    assert!(
-        found_slt,
-        "SLT operation (5 < 10, signed) not found in LT table"
+    assert_eq!(
+        slt_rows, 3,
+        "expected one LT row per SLT op (spec: μ is a Bit, no dedup)"
     );
 }
 

From e2cd492a9b796b66264033292829986bc5ab3926 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 24 Jun 2026 13:44:55 -0300
Subject: [PATCH 2/5] perf(prover): parallelize per-table chunk generation

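chunk_and_generate built each table's chunks sequentially. Chunks are
independent, so generate them with rayon (gated on the `parallel` feature);
`collect` into Result<Vec<_>> preserves chunk order, so output is byte-identical.
Tables are still generated one at a time (no all-tables-parallel), keeping it
compatible with sequential / on-demand commit.

Caveat: with `disk-spill` in Disk mode, a chunk is still spilled right after it
is generated, but under `parallel` several chunks are generated concurrently, so
more than one chunk can be resident on the heap at once (one per in-flight rayon
task) instead of strictly one at a time.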
---
 prover/src/tables/trace_builder.rs | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs
index 02371c1a0..fc776677a 100644
--- a/prover/src/tables/trace_builder.rs
+++ b/prover/src/tables/trace_builder.rs
@@ -2520,10 +2520,10 @@ struct CollectedOps {
 /// Chunk raw ops and generate one trace table per chunk. When `storage_mode`
 /// is `Disk`, each chunk's main table is spilled to mmap before the next chunk
 /// is built so peak heap usage stays bounded.
-fn chunk_and_generate<T>(
+fn chunk_and_generate<T: Sync>(
     ops: &[T],
     max_rows: usize,
-    generate: impl Fn(&[T]) -> TraceTable<GoldilocksField, GoldilocksExtension>,
+    generate: impl Fn(&[T]) -> TraceTable<GoldilocksField, GoldilocksExtension> + Sync,
     #[cfg(feature = "disk-spill")] storage_mode: StorageMode,
 ) -> Result<Vec<TraceTable<GoldilocksField, GoldilocksExtension>>, Error> {
     let op_chunks: Vec<&[T]> = if ops.is_empty() {
@@ -2531,8 +2531,11 @@ fn chunk_and_generate<T>(
     } else {
         ops.chunks(max_rows).collect()
     };
-    let mut tables = Vec::with_capacity(op_chunks.len());
-    for chunk in op_chunks {
+
+    // Each chunk is independent, so generate them concurrently. `collect` into a
+    // `Result<Vec<_>>` preserves chunk order, so a successful build is byte-identical
+    // to the sequential one; if several chunks fail, rayon may return any of the errors.
+    let gen_one = |chunk: &[T]| -> Result<TraceTable<GoldilocksField, GoldilocksExtension>, Error> {
         #[allow(unused_mut)]
         let mut t = generate(chunk);
         #[cfg(feature = "disk-spill")]
@@ -2541,9 +2544,18 @@ fn chunk_and_generate<T>(
                 .spill_to_disk()
                 .map_err(|e| Error::Prover(format!("disk-spill trace: {e}")))?;
         }
-        tables.push(t);
+        Ok(t)
+    };
+
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        op_chunks.into_par_iter().map(gen_one).collect()
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        op_chunks.into_iter().map(gen_one).collect()
     }
-    Ok(tables)
 }
 
 /// Phase 2: Collect and route all operations from CPU ops.

From 22ad4cf9d16ca8e3c310a19ba14d15201f459b3c Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 24 Jun 2026 13:48:19 -0300
Subject: [PATCH 3/5] perf(prover): pre-size MUL/DVRM dedup HashMaps
 (with_capacity)

Avoids rehashing as the dedup map grows. Byte-identical (same dedup result).
---
 prover/src/tables/dvrm.rs | 6 ++++--
 prover/src/tables/mul.rs  | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs
index d3adbdc53..0ab7f4eac 100644
--- a/prover/src/tables/dvrm.rs
+++ b/prover/src/tables/dvrm.rs
@@ -287,8 +287,10 @@ impl DvrmOperation {
 pub fn generate_dvrm_trace(
     operations: &[(DvrmOperation, bool)],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
-    // Deduplicate: (n, d, signed) -> (mu_q, mu_r)
-    let mut op_map: HashMap<DvrmOperation, DvrmMultiplicities> = HashMap::new();
+    // Deduplicate: (n, d, signed) -> (mu_q, mu_r).
+    // Pre-size to the op count (an upper bound on unique ops) to avoid rehashing.
+    let mut op_map: HashMap<DvrmOperation, DvrmMultiplicities> =
+        HashMap::with_capacity(operations.len());
 
     for (op, wants_remainder) in operations {
         let entry = op_map.entry(op.clone()).or_default();
diff --git a/prover/src/tables/mul.rs b/prover/src/tables/mul.rs
index ba414dc63..3bba4da47 100644
--- a/prover/src/tables/mul.rs
+++ b/prover/src/tables/mul.rs
@@ -295,8 +295,10 @@ impl MulOperation {
 pub fn generate_mul_trace(
     operations: &[(MulOperation, bool)],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
-    // Deduplicate: (lhs, lhs_signed, rhs, rhs_signed) -> (mu_lo, mu_hi)
-    let mut op_map: HashMap<MulOperation, MulMultiplicities> = HashMap::new();
+    // Deduplicate: (lhs, lhs_signed, rhs, rhs_signed) -> (mu_lo, mu_hi).
+    // Pre-size to the op count (an upper bound on unique ops) to avoid rehashing.
+    let mut op_map: HashMap<MulOperation, MulMultiplicities> =
+        HashMap::with_capacity(operations.len());
 
     for (op, wants_hi) in operations {
         let entry = op_map.entry(op.clone()).or_default();

From 270bb71c61e2905013a22a1cefa97183d7d28e66 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 24 Jun 2026 13:50:29 -0300
Subject: [PATCH 4/5] perf(prover): DVRM trace gen computes the remainder once
 per row
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

generate_dvrm_trace called compute_remainder() ~6× per row (via n_sub_r/abs_r/
sign_r/sign_n_sub_r, each re-running the integer division). Derive sign_r,
n_sub_r, sign_n_sub_r and abs_r from the single r computed up front.
Byte-identical (same formulas).
---
 prover/src/tables/dvrm.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs
index 0ab7f4eac..57eff713a 100644
--- a/prover/src/tables/dvrm.rs
+++ b/prover/src/tables/dvrm.rs
@@ -313,8 +313,12 @@ pub fn generate_dvrm_trace(
     for (row_idx, (op, multiplicities)) in unique_ops.iter().enumerate() {
         let q = op.compute_quotient();
         let r = op.compute_remainder();
-        let n_sub_r = op.n_sub_r();
-        let abs_r = op.abs_r();
+        // Derive the rest from the single `r` above instead of the helper methods,
+        // each of which recomputes the integer division internally (6×/row → 1×).
+        let sign_r = op.signed && (r >> 63) == 1;
+        let n_sub_r = op.n.wrapping_sub(r);
+        let sign_n_sub_r = op.signed && (n_sub_r >> 63) == 1;
+        let abs_r = DvrmOperation::abs_value(r, sign_r);
         let abs_d = op.abs_d();
 
         // Fill n as DWordHL (4 halfwords)
@@ -341,11 +345,11 @@ pub fn generate_dvrm_trace(
         // Fill n_sub_r as DWordHL (4 halfwords)
         table.set_dword_hl(row_idx, cols::N_SUB_R_0, n_sub_r);
 
-        table.set_bool(row_idx, cols::SIGN_N_SUB_R, op.sign_n_sub_r());
+        table.set_bool(row_idx, cols::SIGN_N_SUB_R, sign_n_sub_r);
         table.set_bool(row_idx, cols::SIGN_N, op.sign_n());
         table.set_bool(row_idx, cols::SIGN_D, op.sign_d());
         table.set_bool(row_idx, cols::SIGN_Q, op.sign_q());
-        table.set_bool(row_idx, cols::SIGN_R, op.sign_r());
+        table.set_bool(row_idx, cols::SIGN_R, sign_r);
 
         // Multiplicities
         table.set_u64(row_idx, cols::MU_Q, multiplicities.mu_q);

From 04356e1832dff5e1c990da00271c6391942a6c90 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 24 Jun 2026 14:29:48 -0300
Subject: [PATCH 5/5] perf(prover): collect state-free CPU chips in parallel
 (PHASE 2 split)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

collect_ops_from_cpu interleaved state-dependent work (MEMW/register/commit/
keccak/ecsm — which thread memory/register state, inherently serial) with
state-free work (CPU range-check bitwise lookups + CPU32/LT/SHIFT dispatch,
derived purely from each logged op).

Split them: the state-free chips are now collected in a parallel pass
(collect_state_free_ops, rayon under the parallel feature) while the serial
loop keeps only the state-threaded work. For CPU-heavy programs the per-op
bitwise range-check collection is a large state-free chunk that now runs off
the serial path. Output is unchanged: LT/SHIFT/CPU32 stay in program order
(ordered collect); the bitwise multiplicity accumulation is order-independent.
---
 prover/src/tables/trace_builder.rs | 120 +++++++++++++++++++----------
 1 file changed, 78 insertions(+), 42 deletions(-)

diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs
index fc776677a..0093a79fa 100644
--- a/prover/src/tables/trace_builder.rs
+++ b/prover/src/tables/trace_builder.rs
@@ -355,6 +355,71 @@ fn collect_cpu_ops(
 ///
 /// Returns: (memw_ops, load_ops, lt_ops, shift_ops, bitwise_ops, commit_ops, keccak_ops,
 /// cpu32_ops, ecsm_ops, ec_scalar_ops, ecdas_ops)
+// NOTE(review): the `///` lines just above document `collect_ops_from_cpu`; inserted here,
+// this fn captures them (and that fn loses its docs). Move it above that doc block.
+/// Collects the state-free chips (CPU range-check bitwise lookups; CPU32 / LT / SHIFT
+/// dispatch) from `cpu_ops`. Parallel under `parallel`; results stay in program order.
+fn collect_state_free_ops(
+    cpu_ops: &[CpuOperation],
+) -> (
+    Vec<BitwiseOperation>,
+    Vec<cpu32::Cpu32Operation>,
+    Vec<LtOperation>,
+    Vec<ShiftOperation>,
+) {
+    let lt = |op: &CpuOperation| -> Option<LtOperation> {
+        let f = op.decode.fields;
+        (!f.word_instr && f.is_lt()).then(|| {
+            LtOperation::new_with_invert(op.rv1, op.arg2, f.alu_signed(), f.alu_signed2_or_invert())
+        })
+    };
+    let shift = |op: &CpuOperation| -> Option<ShiftOperation> {
+        let f = op.decode.fields;
+        (!f.word_instr && f.is_shift()).then(|| {
+            ShiftOperation::new(
+                op.rv1,
+                op.arg2,
+                f.alu_signed2_or_invert(),
+                f.alu_signed(),
+                f.word_instr,
+            )
+        })
+    };
+    #[cfg(feature = "parallel")]
+    {
+        use rayon::prelude::*;
+        (
+            cpu_ops
+                .par_iter()
+                .flat_map_iter(|op| op.collect_bitwise_ops())
+                .collect(),
+            cpu_ops
+                .par_iter()
+                .filter(|op| op.decode.fields.word_instr)
+                .map(build_cpu32_op)
+                .collect(),
+            cpu_ops.par_iter().filter_map(lt).collect(),
+            cpu_ops.par_iter().filter_map(shift).collect(),
+        )
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        (
+            cpu_ops
+                .iter()
+                .flat_map(|op| op.collect_bitwise_ops())
+                .collect(),
+            cpu_ops
+                .iter()
+                .filter(|op| op.decode.fields.word_instr)
+                .map(build_cpu32_op)
+                .collect(),
+            cpu_ops.iter().filter_map(lt).collect(),
+            cpu_ops.iter().filter_map(shift).collect(),
+        )
+    }
+}
+
 #[allow(clippy::type_complexity)]
 fn collect_ops_from_cpu(
     cpu_ops: &[CpuOperation],
@@ -373,14 +438,16 @@ fn collect_ops_from_cpu(
     Vec<ec_scalar::EcScalarOperation>,
     Vec<ecdas::EcdasOperation>,
 ) {
+    // State-free chips (CPU range-check bitwise lookups + CPU32/LT/SHIFT dispatch)
+    // are collected in parallel; the loop below only does the state-dependent work
+    // (MEMW/register/commit/keccak/ecsm — which thread memory/register state).
+    let (cpu_bitwise_ops, cpu32_ops, lt_ops, shift_ops) = collect_state_free_ops(cpu_ops);
+
     let mut memw_ops = Vec::with_capacity(cpu_ops.len() * 3);
     let mut load_ops = Vec::with_capacity(cpu_ops.len() / 8 + 1);
-    let mut lt_ops = Vec::with_capacity(cpu_ops.len() / 10 + 1);
-    let mut shift_ops = Vec::with_capacity(cpu_ops.len() / 10 + 1);
     let mut bitwise_ops = Vec::with_capacity(cpu_ops.len() * 4);
     let mut commit_ops = Vec::new();
     let mut keccak_ops = Vec::new();
-    let mut cpu32_ops = Vec::new();
     let mut ecsm_ops = Vec::new();
     let mut ec_scalar_ops = Vec::new();
     let mut ecdas_ops = Vec::new();
@@ -388,12 +455,9 @@ fn collect_ops_from_cpu(
     let mut commit_ecall_count = 0u32;
 
     for op in cpu_ops {
-        // Word (`*W`) instructions delegate to the CPU32 table (built in program
-        // order; its register accesses are still emitted via the shared register
-        // collector below so the MEMW table balances).
-        if op.decode.fields.word_instr {
-            cpu32_ops.push(build_cpu32_op(op));
-        }
+        // CPU32 register accesses are still emitted via the shared register
+        // collector below so the MEMW table balances; the CPU32 op itself is
+        // built in the state-free parallel pass.
 
         // --- MEMW and LOAD (require state tracking, order matters) ---
 
@@ -474,41 +538,13 @@ fn collect_ops_from_cpu(
             ec_scalar_ops.extend(ec_scalar_rows);
             ecdas_ops.extend(ecdas_rows);
         }
-
-        // --- ALU chip dispatch (no state tracking) ---
-        // Word (`*W`) instructions are delegated to CPU32 (which itself drives
-        // the ALU chips); the main CPU does not send the ALU bus for them, so we
-        // must not emit chip ops here. CPU32 op-generation is B5b.
-        let f = op.decode.fields;
-        if !f.word_instr {
-            // LT: SLT / BLT / BGE, dispatched on the unified ALU bus. `invert`
-            // (BGE/BGEU) is applied inside the LT chip (`out = lt XOR invert`).
-            if f.is_lt() {
-                lt_ops.push(LtOperation::new_with_invert(
-                    op.rv1,
-                    op.arg2,
-                    f.alu_signed(),
-                    f.alu_signed2_or_invert(),
-                ));
-            }
-            // SHIFT: SLL/SRL/SRA. direction = invert bit (0 = left, 1 = right).
-            // The full arg2 goes on the ALU bus as in2; the chip uses its low
-            // byte for the (mod 32/64) computation.
-            if f.is_shift() {
-                shift_ops.push(ShiftOperation::new(
-                    op.rv1,
-                    op.arg2,
-                    f.alu_signed2_or_invert(),
-                    f.alu_signed(),
-                    f.word_instr,
-                ));
-            }
-        }
-
-        // Collect CPU range-check bitwise lookups (ARE_BYTES + IS_HALF).
-        bitwise_ops.extend(op.collect_bitwise_ops());
     }
 
+    // CPU range-check lookups (ARE_BYTES + IS_HALF) were collected in the
+    // state-free pass above; merge them in. Order is irrelevant for the bitwise
+    // multiplicity accumulation.
+    bitwise_ops.extend(cpu_bitwise_ops);
+
     // Each ecall generates count+1 operations (count real rows + 1 end row)
     debug_assert_eq!(
         commit_ops.len(),