From 3868dd832e48119fcc69d30ce78bc73099b2974d Mon Sep 17 00:00:00 2001 From: diegokingston Date: Mon, 22 Jun 2026 19:21:53 -0300 Subject: [PATCH 1/4] perf(stark): fuse composition half-extension onto coset_lde_full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decompose_and_extend_d2's extend_half_to_lde did iFFT(g²) → coefficient Polynomial → evaluate_polynomial_on_lde_domain(g) as two separate FFTs with an intermediate coefficient allocation per half. Replace with a single fused coset_lde_full: iFFT(n) → coset reshift g²→g → forward FFT(2n=lde_size). The weights (g⁻ʲ/n, folding the 1/n iFFT normalization and the net g²→g shift) and the inverse twiddles (size lde_size/2) are precomputed once per domain in LdeTwiddles (the forward FFT reuses the existing fwd twiddles), and threaded through prove_rounds_2_to_4 → round_2 → decompose_and_extend_d2 — no per-call recomputation. This path is now production (degree-3 tables use the 2-part decompose_and_extend_d2 after #699). Byte-identical: test_decompose_and_extend_d2_matches_original (decompose output == original break_in_parts path), a new formula test, stark 130/130, real VM proof (fib_iterative_1200k) prove+verify OK, clippy + fmt clean. --- crypto/stark/src/prover.rs | 76 ++++++++++++++++++-------- crypto/stark/src/tests/prover_tests.rs | 39 ++++++++++++- 2 files changed, 92 insertions(+), 23 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 4da57559c..1a4b1c810 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -269,11 +269,16 @@ pub(crate) struct LdeTwiddles { inv: LayerTwiddles, fwd: LayerTwiddles, coset_weights: Vec>, + /// Composition half-extension (`decompose_and_extend_d2`): inverse twiddles for + /// the g²-coset halves of size `lde_size/2`, and weights `g⁻ʲ/(lde_size/2)`. The + /// forward FFT reuses `fwd` (size `lde_size`). + comp_inv: LayerTwiddles, + comp_weights: Vec>, } impl LdeTwiddles { /// Construct twiddles and coset weights for a domain of the given size and blowup factor. - fn new(domain: &Domain) -> Self { + pub(crate) fn new(domain: &Domain) -> Self { let domain_size = domain.interpolation_domain_size; let lde_size = domain_size * domain.blowup_factor; @@ -291,12 +296,34 @@ impl LdeTwiddles { w }; + // Composition half-extension weights: g⁻ʲ / (lde_size/2). The constraint- + // quotient halves live on the g²-coset of size `lde_size/2`; the unnormalized + // iFFT yields `n·cⱼ·(g²)ʲ` and these weights turn that into `cⱼ·gʲ` for the + // forward FFT onto the g-coset. + let half_size = lde_size / 2; + let half_size_inv = FieldElement::::from(half_size as u64) + .inv() + .expect("half_size is power of two"); + let offset_inv = offset.inv().expect("coset offset is non-zero"); + let comp_weights = { + let mut w = Vec::with_capacity(half_size); + let mut cur = half_size_inv; + for _ in 0..half_size { + w.push(cur.clone()); + cur = &cur * &offset_inv; + } + w + }; + Self { inv: LayerTwiddles::::new_inverse(domain_size.trailing_zeros() as u64) .expect("valid inverse twiddles"), fwd: LayerTwiddles::::new(lde_size.trailing_zeros() as u64) .expect("valid forward twiddles"), coset_weights, + comp_inv: LayerTwiddles::::new_inverse(half_size.trailing_zeros() as u64) + .expect("valid composition inverse twiddles"), + comp_weights, } } } @@ -904,6 +931,7 @@ pub trait IsStarkProver< fn decompose_and_extend_d2( constraint_evaluations: &[FieldElement], domain: &Domain, + twiddles: &LdeTwiddles, ) -> Vec>> where FieldElement: AsBytes + Sync + Send, @@ -934,9 +962,8 @@ pub trait IsStarkProver< (&two_inv * &sum, &inv_2x[i] * &diff) }); - // Step 3: Extend each part from N evals on g²-coset to 2N evals on g-coset. - // The squared coset offset is g² (= coset_offset²). - let coset_offset_squared = &domain.coset_offset * &domain.coset_offset; + // Step 3: Extend each part from n evals on the g²-coset to 2n evals on the + // g-coset (the full LDE domain). // GPU fast path: batch both halves into one ext3 LDE call. Requires // `cuda` feature and a qualifying size. Falls through to CPU when not. @@ -948,35 +975,36 @@ pub trait IsStarkProver< } let (lde_h0, lde_h1) = crate::par::join( - || Self::extend_half_to_lde(&h0_evals, &coset_offset_squared, domain), - || Self::extend_half_to_lde(&h1_evals, &coset_offset_squared, domain), + || Self::extend_half_to_lde(&h0_evals, twiddles), + || Self::extend_half_to_lde(&h1_evals, twiddles), ); vec![lde_h0, lde_h1] } - /// Given N evaluations of a degree-], - squared_offset: &FieldElement, - domain: &Domain, + twiddles: &LdeTwiddles, ) -> Vec> where FieldElement: AsBytes, FieldElement: AsBytes, { - // iFFT on the N-point squared coset to get coefficients - let poly = Polynomial::interpolate_offset_fft(half_evals, squared_offset) - .expect("iFFT should succeed"); - // Evaluate on the full LDE domain (2N points on the g-coset) - evaluate_polynomial_on_lde_domain( - &poly, - domain.blowup_factor, - domain.interpolation_domain_size, - &domain.coset_offset, + debug_assert_eq!(half_evals.len(), twiddles.comp_weights.len()); + Polynomial::coset_lde_full::( + half_evals, + 2, + &twiddles.comp_weights, + &twiddles.comp_inv, + &twiddles.fwd, ) - .expect("LDE evaluation should succeed") + .expect("coset extension") } /// Returns the result of the second round of the STARK Prove protocol. @@ -984,6 +1012,7 @@ pub trait IsStarkProver< air: &dyn AIR, pub_inputs: &PI, domain: &Domain, + twiddles: &LdeTwiddles, round_1_result: &Round1, transition_coefficients: &[FieldElement], boundary_coefficients: &[FieldElement], @@ -1026,7 +1055,7 @@ pub trait IsStarkProver< // H₀(x²) = (H(x) + H(-x)) / 2 // H₁(x²) = (H(x) - H(-x)) / (2x) // On the LDE coset {g·ω^i}, we have -g·ω^i = g·ω^{i+N} since ω^N = -1. - Self::decompose_and_extend_d2(&constraint_evaluations, domain) + Self::decompose_and_extend_d2(&constraint_evaluations, domain, twiddles) } else if number_of_parts == 1 { // Degree bound equals trace length: constraint evals are the LDE directly. vec![constraint_evaluations] @@ -2141,6 +2170,7 @@ pub trait IsStarkProver< &round_1_result, table_transcript, domain, + &twiddle_caches[idx], )?; #[cfg(feature = "instruments")] @@ -2228,6 +2258,7 @@ pub trait IsStarkProver< round_1_result: &Round1, transcript: &mut (impl IsStarkTranscript + Clone), domain: &Domain, + twiddles: &LdeTwiddles, ) -> Result, ProvingError> where FieldElement: AsBytes, @@ -2268,6 +2299,7 @@ pub trait IsStarkProver< air, pub_inputs, domain, + twiddles, round_1_result, &transition_coefficients, &boundary_coefficients, diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs index c645eebb2..204c54651 100644 --- a/crypto/stark/src/tests/prover_tests.rs +++ b/crypto/stark/src/tests/prover_tests.rs @@ -7,7 +7,7 @@ use crate::{ simple_fibonacci::{self, FibonacciAIR, FibonacciPublicInputs}, }, proof::options::ProofOptions, - prover::{IsStarkProver, Prover, evaluate_polynomial_on_lde_domain}, + prover::{IsStarkProver, LdeTwiddles, Prover, evaluate_polynomial_on_lde_domain}, test_utils::multi_prove_ram, tests::domain_cache_stats, trace::{LDETraceTable, get_trace_evaluations, get_trace_evaluations_from_lde}, @@ -21,6 +21,42 @@ use math::{ type Felt = FieldElement; +/// The fused composition half-extension (`extend_half_to_lde`) must produce exactly +/// the same g-coset evaluations as the reference it replaces: iFFT on the g²-coset → +/// coefficients → evaluate on the g-coset LDE. Both yield the unique degree-` = (0..n).map(|i| Felt::from((i as u64) * 7 + 1)).collect(); + + // Reference: iFFT(g²) → coeffs → evaluate on the g-coset of size 2n. + let poly = Polynomial::interpolate_offset_fft(&half, &g2).unwrap(); + let reference = evaluate_polynomial_on_lde_domain(&poly, 2, n, &g).unwrap(); + + // Fused: coset_lde_full with weights wⱼ = g⁻ʲ / n. + let n_inv = Felt::from(n as u64).inv().unwrap(); + let g_inv = g.inv().unwrap(); + let mut weights = Vec::with_capacity(n); + let mut w = n_inv; + for _ in 0..n { + weights.push(w.clone()); + w = &w * &g_inv; + } + let inv = LayerTwiddles::::new_inverse(n.trailing_zeros() as u64).unwrap(); + let fwd = LayerTwiddles::::new((2 * n).trailing_zeros() as u64).unwrap(); + let fused = Polynomial::coset_lde_full::(&half, 2, &weights, &inv, &fwd).unwrap(); + + assert_eq!(reference, fused, "mismatch at n={n}"); + } +} + #[test] fn test_domain_constructor() { let trace = simple_fibonacci::fibonacci_trace([Felt::from(1), Felt::from(1)], 8); @@ -234,6 +270,7 @@ fn test_decompose_and_extend_d2_matches_original() { let new_result = Prover::::decompose_and_extend_d2( &constraint_evaluations, &domain, + &LdeTwiddles::new(&domain), ); assert_eq!(new_result.len(), 2); From 536466f0acc737cd1cced62cd4c9597223d78692 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Tue, 23 Jun 2026 15:53:01 -0300 Subject: [PATCH 2/4] fix(stark): drop clone_on_copy in composition extend test (clippy -D warnings) --- crypto/stark/src/tests/prover_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs index 204c54651..a5468b522 100644 --- a/crypto/stark/src/tests/prover_tests.rs +++ b/crypto/stark/src/tests/prover_tests.rs @@ -46,7 +46,7 @@ fn composition_extend_half_fused_matches_reference() { let mut weights = Vec::with_capacity(n); let mut w = n_inv; for _ in 0..n { - weights.push(w.clone()); + weights.push(w); w = &w * &g_inv; } let inv = LayerTwiddles::::new_inverse(n.trailing_zeros() as u64).unwrap(); From 3951ae2fb366d5559f53252bb1b2c6ec7c23b291 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 26 Jun 2026 16:59:39 -0300 Subject: [PATCH 3/4] fix(stark): keep composition LDE twiddles in release builds --- crypto/stark/src/prover.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 97b28f78c..b99dac5db 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -5,7 +5,6 @@ use std::time::{Duration, Instant}; use crypto::fiat_shamir::is_transcript::IsStarkTranscript; use math::fft::bit_reversing::{in_place_bit_reverse_permute, reverse_index}; -#[cfg(any(test, feature = "test-utils", feature = "debug-checks"))] use math::fft::bowers_fft::LayerTwiddles; use math::fft::errors::FFTError; use math::fft::two_half_fft::TwoHalfTwiddles; @@ -293,9 +292,10 @@ pub(crate) struct LdeTwiddles { two_half_fwd: TwoHalfTwiddles, coset_weights: Vec>, /// Composition half-extension (`decompose_and_extend_d2`): inverse twiddles for - /// the g²-coset halves of size `lde_size/2`, and weights `g⁻ʲ/(lde_size/2)`. The - /// forward FFT reuses `fwd` (size `lde_size`). + /// the g²-coset halves of size `lde_size/2`, forward twiddles for the full + /// g-coset of size `lde_size`, and weights `g⁻ʲ/(lde_size/2)`. comp_inv: LayerTwiddles, + comp_fwd: LayerTwiddles, comp_weights: Vec>, } @@ -324,10 +324,12 @@ impl LdeTwiddles { // iFFT yields `n·cⱼ·(g²)ʲ` and these weights turn that into `cⱼ·gʲ` for the // forward FFT onto the g-coset. let half_size = lde_size / 2; - let half_size_inv = FieldElement::::from(half_size as u64) + let half_size_fe = FieldElement::::from(half_size as u64); + let inv_half_size_offset = (&half_size_fe * offset) .inv() - .expect("half_size is power of two"); - let offset_inv = offset.inv().expect("coset offset is non-zero"); + .expect("half_size and coset offset are non-zero"); + let half_size_inv = offset * &inv_half_size_offset; + let offset_inv = &half_size_fe * &inv_half_size_offset; let comp_weights = { let mut w = Vec::with_capacity(half_size); let mut cur = half_size_inv; @@ -352,6 +354,8 @@ impl LdeTwiddles { coset_weights, comp_inv: LayerTwiddles::::new_inverse(half_size.trailing_zeros() as u64) .expect("valid composition inverse twiddles"), + comp_fwd: LayerTwiddles::::new(lde_size.trailing_zeros() as u64) + .expect("valid composition forward twiddles"), comp_weights, } } @@ -1218,7 +1222,7 @@ pub trait IsStarkProver< 2, &twiddles.comp_weights, &twiddles.comp_inv, - &twiddles.fwd, + &twiddles.comp_fwd, ) .expect("coset extension") } From 7e9b048087a1143e7ac47b1b182c052ce1b062cd Mon Sep 17 00:00:00 2001 From: MauroFab Date: Fri, 26 Jun 2026 17:20:01 -0300 Subject: [PATCH 4/4] fix(stark): lazy composition LDE twiddle cache --- crypto/stark/src/prover.rs | 111 +++++++++++++++---------- crypto/stark/src/tests/prover_tests.rs | 6 +- 2 files changed, 74 insertions(+), 43 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index b99dac5db..eed0e512a 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -1,5 +1,5 @@ use std::marker::PhantomData; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[cfg(feature = "instruments")] use std::time::{Duration, Instant}; @@ -291,12 +291,50 @@ pub(crate) struct LdeTwiddles { two_half_inv: TwoHalfTwiddles, two_half_fwd: TwoHalfTwiddles, coset_weights: Vec>, - /// Composition half-extension (`decompose_and_extend_d2`): inverse twiddles for - /// the g²-coset halves of size `lde_size/2`, forward twiddles for the full - /// g-coset of size `lde_size`, and weights `g⁻ʲ/(lde_size/2)`. - comp_inv: LayerTwiddles, - comp_fwd: LayerTwiddles, - comp_weights: Vec>, + /// Composition half-extension cache, initialized only when the degree-2 + /// decomposition path actually runs on CPU. + composition: OnceLock>, +} + +pub(crate) struct CompositionLdeTwiddles { + /// Inverse twiddles for the g²-coset halves of size `lde_size/2`. + inv: LayerTwiddles, + /// Forward twiddles for the full g-coset of size `lde_size`. + fwd: LayerTwiddles, + /// Weights `g⁻ʲ/(lde_size/2)` for the composition half-extension. + weights: Vec>, +} + +impl CompositionLdeTwiddles { + fn new(half_size: usize, offset: &FieldElement) -> Self { + // Composition half-extension weights: g⁻ʲ / half_size. The constraint- + // quotient halves live on the g²-coset of size `half_size`; the unnormalized + // iFFT yields `n·cⱼ·(g²)ʲ` and these weights turn that into `cⱼ·gʲ` for the + // forward FFT onto the g-coset. + let half_size_fe = FieldElement::::from(half_size as u64); + let inv_half_size_offset = (&half_size_fe * offset) + .inv() + .expect("half_size and coset offset are non-zero"); + let half_size_inv = offset * &inv_half_size_offset; + let offset_inv = &half_size_fe * &inv_half_size_offset; + let weights = { + let mut w = Vec::with_capacity(half_size); + let mut cur = half_size_inv; + for _ in 0..half_size { + w.push(cur.clone()); + cur = &cur * &offset_inv; + } + w + }; + + Self { + inv: LayerTwiddles::::new_inverse(half_size.trailing_zeros() as u64) + .expect("valid composition inverse twiddles"), + fwd: LayerTwiddles::::new((half_size * 2).trailing_zeros() as u64) + .expect("valid composition forward twiddles"), + weights, + } + } } impl LdeTwiddles { @@ -319,27 +357,6 @@ impl LdeTwiddles { w }; - // Composition half-extension weights: g⁻ʲ / (lde_size/2). The constraint- - // quotient halves live on the g²-coset of size `lde_size/2`; the unnormalized - // iFFT yields `n·cⱼ·(g²)ʲ` and these weights turn that into `cⱼ·gʲ` for the - // forward FFT onto the g-coset. - let half_size = lde_size / 2; - let half_size_fe = FieldElement::::from(half_size as u64); - let inv_half_size_offset = (&half_size_fe * offset) - .inv() - .expect("half_size and coset offset are non-zero"); - let half_size_inv = offset * &inv_half_size_offset; - let offset_inv = &half_size_fe * &inv_half_size_offset; - let comp_weights = { - let mut w = Vec::with_capacity(half_size); - let mut cur = half_size_inv; - for _ in 0..half_size { - w.push(cur.clone()); - cur = &cur * &offset_inv; - } - w - }; - Self { #[cfg(any(test, feature = "test-utils", feature = "debug-checks"))] inv: LayerTwiddles::::new_inverse(domain_size.trailing_zeros() as u64) @@ -352,13 +369,22 @@ impl LdeTwiddles { two_half_fwd: TwoHalfTwiddles::::new(lde_size.trailing_zeros() as usize, false) .expect("valid forward two-half twiddles"), coset_weights, - comp_inv: LayerTwiddles::::new_inverse(half_size.trailing_zeros() as u64) - .expect("valid composition inverse twiddles"), - comp_fwd: LayerTwiddles::::new(lde_size.trailing_zeros() as u64) - .expect("valid composition forward twiddles"), - comp_weights, + composition: OnceLock::new(), } } + + fn composition(&self, domain: &Domain) -> &CompositionLdeTwiddles { + let lde_size = domain.interpolation_domain_size * domain.blowup_factor; + let half_size = lde_size / 2; + debug_assert_eq!(self.coset_weights.len(), domain.interpolation_domain_size); + self.composition + .get_or_init(|| CompositionLdeTwiddles::new(half_size, &domain.coset_offset)) + } + + #[cfg(test)] + pub(crate) fn has_composition_cache(&self) -> bool { + self.composition.get().is_some() + } } /// Number of tables to process concurrently in `multi_prove`. @@ -1194,9 +1220,10 @@ pub trait IsStarkProver< return vec![lde_h0, lde_h1]; } + let composition_twiddles = twiddles.composition(domain); let (lde_h0, lde_h1) = crate::par::join( - || Self::extend_half_to_lde(&h0_evals, twiddles), - || Self::extend_half_to_lde(&h1_evals, twiddles), + || Self::extend_half_to_lde(&h0_evals, composition_twiddles), + || Self::extend_half_to_lde(&h1_evals, composition_twiddles), ); vec![lde_h0, lde_h1] } @@ -1206,23 +1233,23 @@ pub trait IsStarkProver< /// /// Fused: iFFT(n) → coset reshift g²→g → forward FFT(2n) in a single pass with no /// intermediate coefficient `Polynomial`. The twiddles and the weights `g⁻ʲ/n` - /// (which fold the 1/n normalization and the net g²→g shift) are precomputed once - /// per domain in [`LdeTwiddles`]. + /// (which fold the 1/n normalization and the net g²→g shift) are cached lazily + /// once per domain in [`LdeTwiddles`]. fn extend_half_to_lde( half_evals: &[FieldElement], - twiddles: &LdeTwiddles, + twiddles: &CompositionLdeTwiddles, ) -> Vec> where FieldElement: AsBytes, FieldElement: AsBytes, { - debug_assert_eq!(half_evals.len(), twiddles.comp_weights.len()); + debug_assert_eq!(half_evals.len(), twiddles.weights.len()); Polynomial::coset_lde_full::( half_evals, 2, - &twiddles.comp_weights, - &twiddles.comp_inv, - &twiddles.comp_fwd, + &twiddles.weights, + &twiddles.inv, + &twiddles.fwd, ) .expect("coset extension") } diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs index ca6edebc2..318dacb81 100644 --- a/crypto/stark/src/tests/prover_tests.rs +++ b/crypto/stark/src/tests/prover_tests.rs @@ -268,11 +268,15 @@ fn test_decompose_and_extend_d2_matches_original() { .collect(); // --- New path: algebraic decomposition --- + let twiddles = LdeTwiddles::new(&domain); + assert!(!twiddles.has_composition_cache()); let new_result = Prover::::decompose_and_extend_d2( &constraint_evaluations, &domain, - &LdeTwiddles::new(&domain), + &twiddles, ); + #[cfg(not(feature = "cuda"))] + assert!(twiddles.has_composition_cache()); assert_eq!(new_result.len(), 2); assert_eq!(new_result[0].len(), original[0].len());