From aa95fd246767da9ac875de93710151d290f6cc37 Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Tue, 21 Apr 2026 21:33:17 +0000 Subject: [PATCH 1/6] sequential column reads in commit_columns_bit_reversed Read columns at natural index k inside the parallel hashing loop, then apply in_place_bit_reverse_permute to the Commitment vector before building the Merkle tree. Same leaves as reading at br(row_idx) inside the loop; replaces scattered column reads (~2GB volume on MEMW_R) with sequential reads plus a 64MB in-place bit-reverse pass. --- crypto/stark/src/prover.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 41ccb8366..5fa7737ef 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -361,10 +361,11 @@ pub trait IsStarkProver< /// Builds a Merkle tree commitment from column-major LDE evaluations with /// bit-reverse permutation, without cloning the full evaluation matrix. /// - /// For each row index `i`, we hash `col_0[br(i)] || col_1[br(i)] || ...` - /// where `br(i)` is the bit-reversal of `i`. This produces the same Merkle - /// tree as the old clone + bit-reverse + columns2rows + batch_commit flow, - /// but avoids allocating the cloned and transposed matrices entirely. + /// Hashes `col_0[k] || col_1[k] || ...` for k = 0..num_rows (sequential column + /// reads, cache-friendly), then permutes the hash vector in bit-reversed order + /// so leaves[i] = hash(col_0[br(i)] || col_1[br(i)] || ...). Same Merkle tree + /// as reading at br(row_idx) inside the hashing loop, but the scattered column + /// access is replaced by a single small bit-reverse pass over 32-byte digests. fn commit_columns_bit_reversed( columns: &[Vec>], ) -> Option<(BatchedMerkleTree, Commitment)> @@ -392,21 +393,20 @@ pub trait IsStarkProver< #[cfg(not(feature = "parallel"))] let iter = 0..num_rows; - // One allocation per row (was one per field element): write all columns - // into a single buffer, then hash once. - let hashed_leaves: Vec = iter - .map(|row_idx| { - let br_idx = reverse_index(row_idx, num_rows as u64); + let mut hashed_leaves: Vec = iter + .map(|k| { let total_bytes = num_cols * byte_len; let mut buf = vec![0u8; total_bytes]; for col_idx in 0..num_cols { - columns[col_idx][br_idx] + columns[col_idx][k] .write_bytes_be(&mut buf[col_idx * byte_len..(col_idx + 1) * byte_len]); } BatchedMerkleTreeBackend::::hash_bytes(&buf) }) .collect(); + in_place_bit_reverse_permute(&mut hashed_leaves); + let tree = BatchedMerkleTree::::build_from_hashed_leaves(hashed_leaves)?; let root = tree.root; Some((tree, root)) From 6a003a2f907fe512ad4f13c09cfcac6c6d87f73d Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 18:43:48 +0000 Subject: [PATCH 2/6] parallelize chunk_and_generate with par_chunks Phase 5 of trace build invokes chunk_and_generate 10 times; each call walked its chunks sequentially. MEMW alone produces ~12 chunks at fib_iterative_2M, so there is substantial per-chunk parallelism available on a free rayon pool (trace build runs before multi_prove). fib_iterative_2M on Linux x86_64, 12 cores, 3 samples: - prove wall-clock: 75.4s -> 74.3s median (-1.5%) - Trace build sub-phase: 4.56s -> 3.96s (-13.2%) - Verification against baseline binary: PASS --- prover/src/tables/trace_builder.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs index b1af698de..56636fc39 100644 --- a/prover/src/tables/trace_builder.rs +++ b/prover/src/tables/trace_builder.rs @@ -1654,15 +1654,23 @@ struct CollectedOps { } /// Chunk raw ops and generate one trace table per chunk. -fn chunk_and_generate( +fn chunk_and_generate( ops: &[T], max_rows: usize, - generate: impl Fn(&[T]) -> TraceTable, + generate: impl Fn(&[T]) -> TraceTable + Sync + Send, ) -> Vec> { if ops.is_empty() { vec![generate(&[])] } else { - ops.chunks(max_rows).map(generate).collect() + #[cfg(feature = "parallel")] + { + use rayon::prelude::*; + ops.par_chunks(max_rows).map(&generate).collect() + } + #[cfg(not(feature = "parallel"))] + { + ops.chunks(max_rows).map(generate).collect() + } } } From d43ae4e5c1f7c84b723b33b4355a2546bd9f943f Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 19:40:40 +0000 Subject: [PATCH 3/6] skip redundant bit-reverse pair in R4 deep-composition LDE round_4 called evaluate_fft (which internally permutes the FFT output to natural order) and then in_place_bit_reverse_permute on the result to flip it back. Both permutes cancel. FRI commit_phase_from_evaluations pairs evals as chunks_exact(2) expecting {f(x), f(-x)} adjacency, which is exactly the bit-reversed output of the Bowers forward FFT. Added Polynomial::evaluate_fft_bit_reversed that skips the final permute, and called it from round_4. Result: two ~24ms permutes (at 2N=4M per table) eliminated per prove. fib_iterative_2M on Linux x86_64, 12 cores, 5 samples: - prove wall-clock: 75.4s -> 74.4s median (-1.3%), 75.5s -> 74.3s mean (-1.6%) - R4 interpolate+evaluate_fft sub-phase: 2.73s -> 1.95s (-29%) - CV 0.6% (2xCV=1.2% threshold, 1.3% improvement clears it) - Verification against baseline binary: PASS --- crypto/math/src/fft/polynomial.rs | 31 +++++++++++++++++++++++++++++++ crypto/stark/src/prover.rs | 11 +++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs index 9157903fd..129473207 100644 --- a/crypto/math/src/fft/polynomial.rs +++ b/crypto/math/src/fft/polynomial.rs @@ -80,6 +80,37 @@ impl Polynomial> { evaluate_fft_cpu::(&coeffs) } + /// Same as `evaluate_fft` but returns the evaluations in bit-reversed order, + /// skipping the final natural-order permutation. Use when the consumer expects + /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as + /// {f(x), f(-x)}). + pub fn evaluate_fft_bit_reversed>( + poly: &Polynomial>, + blowup_factor: usize, + domain_size: Option, + ) -> Result>, FFTError> + where + E: Send + Sync, + { + let domain_size = domain_size.unwrap_or(0); + let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor; + if len.trailing_zeros() as u64 > F::TWO_ADICITY { + return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize)); + } + if poly.coefficients().is_empty() { + return Ok(vec![FieldElement::zero(); len]); + } + + let mut coeffs = poly.coefficients().to_vec(); + coeffs.resize(len, FieldElement::zero()); + + let order = len.trailing_zeros() as u64; + let layer_twiddles = + LayerTwiddles::::new(order).ok_or(FFTError::DomainSizeError(order as usize))?; + dispatch_fft(&mut coeffs, &layer_twiddles)?; + Ok(coeffs) + } + /// Returns `N` evaluations with an offset of this polynomial using FFT over a domain in a subfield F of E /// (so the results are P(w^i), with w being a primitive root of unity). /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`. diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 5fa7737ef..759ffac93 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::Instant; use crypto::fiat_shamir::is_transcript::IsStarkTranscript; -use math::fft::cpu::bit_reversing::{in_place_bit_reverse_permute, reverse_index}; +use math::fft::cpu::bit_reversing::reverse_index; use math::fft::cpu::bowers_fft::LayerTwiddles; use math::fft::errors::FFTError; @@ -1081,9 +1081,12 @@ pub trait IsStarkProver< let t_sub = Instant::now(); let deep_poly = Polynomial::interpolate_fft::(&deep_evals).expect("iFFT should succeed"); - let mut lde_evals = Polynomial::evaluate_fft::(&deep_poly, 1, Some(domain_size)) - .expect("FFT should succeed"); - in_place_bit_reverse_permute(&mut lde_evals); + // FRI commit_phase consumes bit-reversed evaluations natively. Request them + // directly from evaluate_fft_bit_reversed to avoid a pair of redundant permutes + // (evaluate_fft's internal natural-order permute + an external re-bit-reverse). + let lde_evals = + Polynomial::evaluate_fft_bit_reversed::(&deep_poly, 1, Some(domain_size)) + .expect("FFT should succeed"); #[cfg(feature = "instruments")] let r4_fft_dur = t_sub.elapsed(); From 901a71634f593522299162d52db180a31b1480ad Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 19:53:18 +0000 Subject: [PATCH 4/6] parallelize in_place_bit_reverse_permute Every FFT call site ends with a sequential O(N) bit-reverse permutation. At N=4M elements this is ~24ms on its own, called dozens of times per prove across all column LDEs, composition-poly parts, and the R4 deep LDE. Bottlenecks the otherwise-parallel FFT pipeline (Amdahl). Swap pairs (i, br(i)) with i < br(i) are disjoint, so parallelization is safe with a Send/Sync raw-pointer wrapper (the `i < br(i)` predicate selects a unique owner per pair, so no two threads ever touch the same slot). Sequential fallback retained for N < 16K. fib_iterative_2M on Linux x86_64, 12 cores, 3 samples: - prove wall-clock: 75.4s -> 73.9s median (-2.0%), 75.5s -> 74.1s mean (-1.9%) - R2 decompose_and_extend_d2: 8.28s -> 7.78s (-6.0%) - R4 interpolate+evaluate_fft: 2.73s -> 2.40s (-12.1%) - CV 0.7% (2xCV=1.4% threshold, 2.0% improvement clears it comfortably) - Verification against baseline binary: PASS - All 121 stark lib tests + math bit_reverse tests pass --- crypto/math/src/fft/cpu/bit_reversing.rs | 41 ++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/crypto/math/src/fft/cpu/bit_reversing.rs b/crypto/math/src/fft/cpu/bit_reversing.rs index f225dd5e0..fd6936ff7 100644 --- a/crypto/math/src/fft/cpu/bit_reversing.rs +++ b/crypto/math/src/fft/cpu/bit_reversing.rs @@ -1,7 +1,42 @@ /// In-place bit-reverse permutation algorithm. Requires input length to be a power of two. -pub fn in_place_bit_reverse_permute(input: &mut [E]) { - for i in 0..input.len() { - let bit_reversed_index = reverse_index(i, input.len() as u64); +pub fn in_place_bit_reverse_permute(input: &mut [E]) { + let n = input.len(); + #[cfg(feature = "parallel")] + { + // Pair-parallel swap: each pair (i, br(i)) with i < br(i) is independent of all + // other pairs (disjoint indices), so threads can swap concurrently provided they + // never touch the same memory location. `if br > i` selects exactly one owner + // per pair, so no two threads ever write the same slot. + const PARALLEL_BITREV_THRESHOLD: usize = 1 << 14; + if n >= PARALLEL_BITREV_THRESHOLD { + use rayon::prelude::*; + struct SendPtr(*mut E); + impl Copy for SendPtr {} + impl Clone for SendPtr { + fn clone(&self) -> Self { + *self + } + } + unsafe impl Send for SendPtr {} + unsafe impl Sync for SendPtr {} + let ptr = SendPtr(input.as_mut_ptr()); + (0..n).into_par_iter().for_each(|i| { + let br = reverse_index(i, n as u64); + if br > i { + // SAFETY: (i, br) uniquely identifies this pair (smaller index is owner), + // so no two threads race on the same `ptr.0.add(k)` slot. Both indices + // are in-bounds since i < n and br < n. + let p = ptr; + unsafe { + core::ptr::swap(p.0.add(i), p.0.add(br)); + } + } + }); + return; + } + } + for i in 0..n { + let bit_reversed_index = reverse_index(i, n as u64); if bit_reversed_index > i { input.swap(i, bit_reversed_index); } From bccd4d204fe5ebc361d600df32fd05b542110ed4 Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 20:15:31 +0000 Subject: [PATCH 5/6] chunked parallel inplace_batch_inverse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Montgomery batch inverse has a serial prefix-product dependency, but chunks are independent: each chunk inverts its own elements without needing values from other chunks. Trade K-1 extra field inversions (~1000 mults each in Goldilocks, negligible next to the ~2N mults per chunk) for K-way parallelism. Threshold at 2^16 elements so short batches (single-FRI-layer twiddles at smaller layers, inv arrays in small tables) keep the sequential path. Above threshold, split into num_threads chunks and invert each independently via par_chunks_mut. This is surprisingly impactful because batch_inverse is called on large inputs throughout the prover — coset-point inverses in R2 decompose, constraint-denominator inverses, OOD x_i - z inverses, deep-composition inv_h / inv_t, FRI coset twiddles, etc. Every summed-over-tables hot denominator pipeline hits it. fib_iterative_2M on Linux x86_64, 12 cores, 3 samples: - prove wall-clock: 75.4s -> 72.45s median (-3.9%), 75.5s -> 72.4s mean (-4.1%) - R3 OOD evaluation: 5.66s -> 4.30s (-24%) - R4 deep_composition_poly_evals: 5.62s -> 4.42s (-21%) - R4 queries & openings: 1.58s -> 1.14s (-28%) - R4 interpolate+evaluate_fft: 2.87s -> 2.70s (-6%) - CV 0.4% (2xCV=0.8%, 3.9% improvement clears it easily) - Verification against baseline binary: PASS - All math batch_inverse + stark 121 lib tests pass --- crypto/math/src/field/element.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/crypto/math/src/field/element.rs b/crypto/math/src/field/element.rs index 9c2ac3258..e34ec0fb7 100644 --- a/crypto/math/src/field/element.rs +++ b/crypto/math/src/field/element.rs @@ -51,7 +51,29 @@ impl FieldElement { /// Computes the multiplicative inverses of a slice of field elements /// The algorithm just performs one inversion and several multiplications and should be used /// when wanting to invert several elements together - pub fn inplace_batch_inverse(numbers: &mut [Self]) -> Result<(), FieldError> { + pub fn inplace_batch_inverse(numbers: &mut [Self]) -> Result<(), FieldError> + where + Self: Send + Sync, + { + #[cfg(feature = "parallel")] + { + // Montgomery batch inverse has a serial prefix-product dependency, but + // chunks are independent — each chunk inverts its own elements without + // needing values from other chunks. Trade K-1 extra field inversions + // (negligible vs ~2N mults per chunk) for K-way parallelism. + const PARALLEL_BATCH_INV_THRESHOLD: usize = 1 << 16; + if numbers.len() >= PARALLEL_BATCH_INV_THRESHOLD { + use rayon::prelude::*; + let chunk_size = numbers.len().div_ceil(rayon::current_num_threads().max(1)); + return numbers + .par_chunks_mut(chunk_size) + .try_for_each(Self::inplace_batch_inverse_sequential); + } + } + Self::inplace_batch_inverse_sequential(numbers) + } + + fn inplace_batch_inverse_sequential(numbers: &mut [Self]) -> Result<(), FieldError> { if numbers.is_empty() { return Ok(()); } From c1dd55632baa2d1f12912eaccb0888055d1eb186 Mon Sep 17 00:00:00 2001 From: Gabriel Bosio Date: Wed, 22 Apr 2026 21:35:15 +0000 Subject: [PATCH 6/6] import in_place_bit_reverse_permute in prover --- crypto/stark/src/prover.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 759ffac93..d418f7773 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::Instant; use crypto::fiat_shamir::is_transcript::IsStarkTranscript; -use math::fft::cpu::bit_reversing::reverse_index; +use math::fft::cpu::bit_reversing::{in_place_bit_reverse_permute, reverse_index}; use math::fft::cpu::bowers_fft::LayerTwiddles; use math::fft::errors::FFTError;