From a3f58fcc7f541f59bd6978b088fbba3459c7d55b Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Tue, 21 Apr 2026 21:33:17 +0000 Subject: [PATCH 01/10] sequential column reads in commit_columns_bit_reversed Read columns at natural index k inside the parallel hashing loop, then apply in_place_bit_reverse_permute to the Commitment vector before building the Merkle tree. Same leaves as reading at br(row_idx) inside the loop; replaces scattered column reads (~2GB volume on MEMW_R) with sequential reads plus a 64MB in-place bit-reverse pass. --- crypto/stark/src/prover.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 41ccb8366..5fa7737ef 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -361,10 +361,11 @@ pub trait IsStarkProver< /// Builds a Merkle tree commitment from column-major LDE evaluations with /// bit-reverse permutation, without cloning the full evaluation matrix. /// - /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...` - /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle - /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow, - /// but avoids allocating the cloned and transposed matrices entirely. + /// Hashes `col_0[k] || col_1[k] || ...` for k = 0..num_rows (sequential column + /// reads, cache-friendly), then permutes the hash vector in bit-reversed order + /// so leaves[i] = hash(col_0[br(i)] || col_1[br(i)] || ...). Same Merkle tree + /// as reading at br(row_idx) inside the hashing loop, but the scattered column + /// access is replaced by a single small bit-reverse pass over 32-byte digests. fn commit_columns_bit_reversed( columns: &[Vec>], ) -> Option<(BatchedMerkleTree, Commitment)> @@ -392,21 +393,20 @@ pub trait IsStarkProver< #[cfg(not(feature = "parallel"))] let iter = 0..num_rows; - // One allocation per row (was one per field element): write all columns - // into a single buffer, then hash once. - let hashed_leaves: Vec = iter - .map(|row_idx| { - let br_idx = reverse_index(row_idx, num_rows as u64); + let mut hashed_leaves: Vec = iter + .map(|k| { let total_bytes = num_cols * byte_len; let mut buf = vec![0u8; total_bytes]; for col_idx in 0..num_cols { - columns[col_idx][br_idx] + columns[col_idx][k] .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]); } BatchedMerkleTreeBackend::::hash_bytes(&buf) }) .collect(); + in_place_bit_reverse_permute(&mut hashed_leaves); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; let root = tree.root; Some((tree, root)) From ffcb5729ae296a5c185fa12fc7068d007e8b0dc4 Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 19:40:40 +0000 Subject: [PATCH 02/10] skip redundant bit-reverse pair in R4 deep-composition LDE round_4 called evaluate_fft (which internally permutes the FFT output to natural order) and then in_place_bit_reverse_permute on the result to flip it back. Both permutes cancel. FRI commit_phase_from_evaluations pairs evals as chunks_exact(2) expecting {f(x), f(-x)} adjacency, which is exactly the bit-reversed output of the Bowers forward FFT. Added Polynomial::evaluate_fft_bit_reversed that skips the final permute, and called it from round_4. Result: two ~24ms permutes (at 2N=4M per table) eliminated per prove. fib_iterative_2M on Linux x86_64, 12 cores, 5 samples: - prove wall-clock: 75.4s -> 74.4s median (-1.3%), 75.5s -> 74.3s mean (-1.6%) - R4 interpolate+evaluate_fft sub-phase: 2.73s -> 1.95s (-29%) - CV 0.6% (2xCV=1.2% threshold, 1.3% improvement clears it) - Verification against baseline binary: PASS --- crypto/math/src/fft/polynomial.rs | 31 +++++++++++++++++++++++++++++++ crypto/stark/src/prover.rs | 11 +++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index 9157903fd..129473207 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -80,6 +80,37 @@ impl Polynomial> { evaluate_fft_cpu::(&coeffs) } + /// Same as `evaluate_fft` but returns the evaluations in bit-reversed order, + /// skipping the final natural-order permutation. Use when the consumer expects + /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as + /// {f(x), f(-x)}). + pub fn evaluate_fft_bit_reversed>( + poly: &Polynomial>, + blowup_factor: usize, + domain_size: Option, + ) -> Result>, FFTError> + where + E: Send + Sync, + { + let domain_size = domain_size.unwrap_or(0); + let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor; + if len.trailing_zeros() as u64 > F::TWO_ADICITY { + return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize)); + } + if poly.coefficients().is_empty() { + return Ok(vec![FieldElement::zero(); len]); + } + + let mut coeffs = poly.coefficients().to_vec(); + coeffs.resize(len, FieldElement::zero()); + + let order = len.trailing_zeros() as u64; + let layer_twiddles = + LayerTwiddles::::new(order).ok_or(FFTError::DomainSizeError(order as usize))?; + dispatch_fft(&mut coeffs, &layer_twiddles)?; + Ok(coeffs) + } + /// Returns `N` evaluations with an offset of this polynomial using FFT over a domain in a subfield F of E /// (so the results are P(w^i), with w being a primitive root of unity). /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`. diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 5fa7737ef..759ffac93 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::Instant; use crypto::fiat_shamir::is_transcript::IsStarkTranscript; -use math::fft::cpu::bit_reversing::{in_place_bit_reverse_permute, reverse_index}; +use math::fft::cpu::bit_reversing::reverse_index; use math::fft::cpu::bowers_fft::LayerTwiddles; use math::fft::errors::FFTError; @@ -1081,9 +1081,12 @@ pub trait IsStarkProver< let t_sub = Instant::now(); let deep_poly = Polynomial::interpolate_fft::(&deep_evals).expect("iFFT should succeed"); - let mut lde_evals = Polynomial::evaluate_fft::(&deep_poly, 1, Some(domain_size)) - .expect("FFT should succeed"); - in_place_bit_reverse_permute(&mut lde_evals); + // FRI commit_phase consumes bit-reversed evaluations natively. Request them + // directly from evaluate_fft_bit_reversed to avoid a pair of redundant permutes + // (evaluate_fft's internal natural-order permute + an external re-bit-reverse). + let lde_evals = + Polynomial::evaluate_fft_bit_reversed::(&deep_poly, 1, Some(domain_size)) + .expect("FFT should succeed"); #[cfg(feature = "instruments")] let r4_fft_dur = t_sub.elapsed(); From 95553853210a6cb6e2bf419031dd146c7274aafc Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 21:35:15 +0000 Subject: [PATCH 03/10] import in_place_bit_reverse_permute in prover --- crypto/stark/src/prover.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 759ffac93..d418f7773 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::Instant; use crypto::fiat_shamir::is_transcript::IsStarkTranscript; -use math::fft::cpu::bit_reversing::reverse_index; +use math::fft::cpu::bit_reversing::{in_place_bit_reverse_permute, reverse_index}; use math::fft::cpu::bowers_fft::LayerTwiddles; use math::fft::errors::FFTError; From 74587c6abf363bb1bc5d507bff0bb896f7d1ab2a Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 16:56:35 -0300 Subject: [PATCH 04/10] guard non-power-of-two len in evaluate_fft_bit_reversed --- crypto/math/src/fft/polynomial.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index 129473207..2d5702b58 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -101,6 +101,10 @@ impl Polynomial> { return Ok(vec![FieldElement::zero(); len]); } + if !len.is_power_of_two() { + return Err(FFTError::InputError(len)); + } + let mut coeffs = poly.coefficients().to_vec(); coeffs.resize(len, FieldElement::zero()); From b6f798bf0254af3a8426c799912b9a10029921e2 Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 16:56:35 -0300 Subject: [PATCH 05/10] test evaluate_fft_bit_reversed invariants --- crypto/math/src/fft/polynomial.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index 2d5702b58..cd3f6f958 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -549,4 +549,34 @@ mod tests { assert_eq!(reference, buffer, "Mismatch for seed {}", seed); } } + + #[test] + fn evaluate_fft_bit_reversed_matches_evaluate_fft_then_permute() { + for order in 1..=8 { + let n = 1usize << order; + let coeffs: Vec = (0..n).map(|i| FE::from((i * 17 + 3) as u64)).collect(); + let poly = Polynomial::new(&coeffs); + + for blowup_factor in [1, 2, 4] { + let size = Some(n); + let mut expected = + Polynomial::evaluate_fft::(&poly, blowup_factor, size).unwrap(); + in_place_bit_reverse_permute(&mut expected); + + let got = + Polynomial::evaluate_fft_bit_reversed::(&poly, blowup_factor, size).unwrap(); + + assert_eq!(got, expected, "order={order}, blowup={blowup_factor}"); + } + } + } + + #[test] + fn evaluate_fft_bit_reversed_rejects_non_power_of_two_blowup() { + let coeffs: Vec = (0..8).map(|i| FE::from(i as u64)).collect(); + let poly = Polynomial::new(&coeffs); + + let err = Polynomial::evaluate_fft_bit_reversed::(&poly, 3, Some(8)); + assert!(matches!(err, Err(FFTError::InputError(_)))); + } } From 826684645863600ad4a4eb98e2a126d92b3340bd Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 17:10:06 -0300 Subject: [PATCH 06/10] hoist non-power-of-two guard before empty check --- crypto/math/src/fft/polynomial.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index cd3f6f958..c13ea9a93 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -97,14 +97,14 @@ impl Polynomial> { if len.trailing_zeros() as u64 > F::TWO_ADICITY { return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize)); } - if poly.coefficients().is_empty() { - return Ok(vec![FieldElement::zero(); len]); - } - if !len.is_power_of_two() { return Err(FFTError::InputError(len)); } + if poly.coefficients().is_empty() { + return Ok(vec![FieldElement::zero(); len]); + } + let mut coeffs = poly.coefficients().to_vec(); coeffs.resize(len, FieldElement::zero()); @@ -578,5 +578,9 @@ mod tests { let err = Polynomial::evaluate_fft_bit_reversed::(&poly, 3, Some(8)); assert!(matches!(err, Err(FFTError::InputError(_)))); + + let empty = Polynomial::::new(&[]); + let err = Polynomial::evaluate_fft_bit_reversed::(&empty, 3, None); + assert!(matches!(err, Err(FFTError::InputError(_)))); } } From 1fe9a220bb79e7f6f9b0b25928ab57728909e88c Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 18:23:10 -0300 Subject: [PATCH 07/10] share FFT implementation via evaluate_fft_cpu_inner --- crypto/math/src/fft/polynomial.rs | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index c13ea9a93..bd834fbb9 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -107,12 +107,7 @@ impl Polynomial> { let mut coeffs = poly.coefficients().to_vec(); coeffs.resize(len, FieldElement::zero()); - - let order = len.trailing_zeros() as u64; - let layer_twiddles = - LayerTwiddles::::new(order).ok_or(FFTError::DomainSizeError(order as usize))?; - dispatch_fft(&mut coeffs, &layer_twiddles)?; - Ok(coeffs) + evaluate_fft_cpu_inner::(coeffs, false) } /// Returns `N` evaluations with an offset of this polynomial using FFT over a domain in a subfield F of E @@ -313,7 +308,10 @@ where Polynomial::interpolate_fft::(values.as_slice()).unwrap() } -pub fn evaluate_fft_cpu(coeffs: &[FieldElement]) -> Result>, FFTError> +fn evaluate_fft_cpu_inner( + mut coeffs: Vec>, + permute_after: bool, +) -> Result>, FFTError> where F: IsFFTField + IsSubFieldOf, E: IsField + Send + Sync, @@ -326,10 +324,19 @@ where let layer_twiddles = LayerTwiddles::::new(order).ok_or(FFTError::DomainSizeError(order as usize))?; - let mut result = coeffs.to_vec(); - dispatch_fft(&mut result, &layer_twiddles)?; - in_place_bit_reverse_permute(&mut result); - Ok(result) + dispatch_fft(&mut coeffs, &layer_twiddles)?; + if permute_after { + in_place_bit_reverse_permute(&mut coeffs); + } + Ok(coeffs) +} + +pub fn evaluate_fft_cpu(coeffs: &[FieldElement]) -> Result>, FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, +{ + evaluate_fft_cpu_inner::(coeffs.to_vec(), true) } pub fn interpolate_fft_cpu( From d7b0dd3b17a9f1c647c9aaacc7ba7b73758af760 Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 18:23:12 -0300 Subject: [PATCH 08/10] note stricter validation in evaluate_fft_bit_reversed docs --- crypto/math/src/fft/polynomial.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index bd834fbb9..ecdaa44b5 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -84,6 +84,10 @@ impl Polynomial> { /// skipping the final natural-order permutation. Use when the consumer expects /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as /// {f(x), f(-x)}). + /// + /// Stricter validation than `evaluate_fft`: non-power-of-two `len` is rejected + /// even for empty polynomials. `evaluate_fft`'s guard lives inside + /// `evaluate_fft_cpu`, which its empty-polynomial fast path skips. pub fn evaluate_fft_bit_reversed>( poly: &Polynomial>, blowup_factor: usize, From d2cb6eaabc43415145269a7f416fbdf8a9c9f3f8 Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 19:09:40 -0300 Subject: [PATCH 09/10] cover zero-padding case in evaluate_fft_bit_reversed test --- crypto/math/src/fft/polynomial.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index ecdaa44b5..8b90cbd1b 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -569,15 +569,21 @@ mod tests { let poly = Polynomial::new(&coeffs); for blowup_factor in [1, 2, 4] { - let size = Some(n); - let mut expected = - Polynomial::evaluate_fft::(&poly, blowup_factor, size).unwrap(); - in_place_bit_reverse_permute(&mut expected); - - let got = - Polynomial::evaluate_fft_bit_reversed::(&poly, blowup_factor, size).unwrap(); - - assert_eq!(got, expected, "order={order}, blowup={blowup_factor}"); + for domain_mult in [1, 2] { + let size = Some(n * domain_mult); + let mut expected = + Polynomial::evaluate_fft::(&poly, blowup_factor, size).unwrap(); + in_place_bit_reverse_permute(&mut expected); + + let got = + Polynomial::evaluate_fft_bit_reversed::(&poly, blowup_factor, size) + .unwrap(); + + assert_eq!( + got, expected, + "order={order}, blowup={blowup_factor}, domain_mult={domain_mult}" + ); + } } } } From 8c4b9c75309df2ed46d57cfb3b65289e5bbe82d0 Mon Sep 17 00:00:00 2001 From: gabrielbosio Date: Thu, 23 Apr 2026 19:09:41 -0300 Subject: [PATCH 10/10] test commit_columns_bit_reversed matches naive reference --- crypto/stark/src/tests/prover_tests.rs | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/crypto/stark/src/tests/prover_tests.rs b/crypto/stark/src/tests/prover_tests.rs index 55d58da7e..d0e7fe8d1 100644 --- a/crypto/stark/src/tests/prover_tests.rs +++ b/crypto/stark/src/tests/prover_tests.rs @@ -233,3 +233,40 @@ fn test_decompose_and_extend_d2_matches_original() { assert_eq!(new_result[1][i], original[1][i], "H₁ mismatch at index {i}"); } } + +#[test] +fn commit_columns_bit_reversed_matches_naive_reference() { + use crate::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; + use math::fft::cpu::bit_reversing::reverse_index; + use math::traits::ByteConversion; + + let num_rows = 8usize; + let num_cols = 3usize; + let columns: Vec> = (0..num_cols) + .map(|c| { + (0..num_rows) + .map(|r| Felt::from((c * num_rows + r + 1) as u64)) + .collect() + }) + .collect(); + + let byte_len = ::BYTE_LEN; + let reference_leaves: Vec = (0..num_rows) + .map(|i| { + let br_i = reverse_index(i, num_rows as u64); + let mut buf = vec![0u8; num_cols * byte_len]; + for c in 0..num_cols { + columns[c][br_i].write_bytes_be(&mut buf[c * byte_len..(c + 1) * byte_len]); + } + BatchedMerkleTreeBackend::::hash_bytes(&buf) + }) + .collect(); + let reference_tree = + BatchedMerkleTree::::build_from_hashed_leaves(reference_leaves).unwrap(); + + let (_tree, root) = + Prover::::commit_columns_bit_reversed(&columns) + .unwrap(); + + assert_eq!(reference_tree.root, root); +}