From 0c1d31c00bf5504c685e5cf6e3aa70347b7fc725 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 09:47:25 -0300
Subject: [PATCH 01/10] refactor(math): remove dead Deserializable trait and
 unreachable error variants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`Deserializable` had a single impl (`Proof<T>`) whose only `deserialize`
call site was inside that impl itself — a recursion with no external
entry point. Production deserialization uses `bincode`/serde everywhere.
Delete the trait and the `Proof<T>` impl.

`merkle_tree/proof.rs` also carried a dead `Serializable` impl behind
`#[cfg(feature = "alloc")]` — a feature the `crypto` crate does not even
define, so the block never compiled. It referenced `math::traits::Serializable`,
which does not exist. Remove it.

Error cleanup (none constructed anywhere in the workspace):
- `DeserializationError` enum + `From<ByteConversionError>` impl — only
  reachable through the now-deleted `Deserializable`.
- `PairingError` enum — lambda_vm has no pairings.
- `ByteConversionError::{InvalidValue, PointNotInSubgroup, ValueNotCompressed}`
  — the latter two are elliptic-curve concepts; there is no EC here.
- `CreationError::InvalidDecString`.

Pure dead-code removal; `cargo build`/`cargo test -p math` unchanged
(216 tests pass).
---
 crypto/crypto/src/merkle_tree/proof.rs | 33 --------------------------
 crypto/math/src/errors.rs              | 28 ----------------------
 crypto/math/src/traits.rs              |  9 -------
 3 files changed, 70 deletions(-)
diff --git a/crypto/crypto/src/merkle_tree/proof.rs b/crypto/crypto/src/merkle_tree/proof.rs
index 20d5452a2..3114a3436 100644
--- a/crypto/crypto/src/merkle_tree/proof.rs
+++ b/crypto/crypto/src/merkle_tree/proof.rs
@@ -1,7 +1,4 @@
 use alloc::{collections::BTreeMap, vec::Vec};
-#[cfg(feature = "alloc")]
-use math::traits::Serializable;
-use math::{errors::DeserializationError, traits::Deserializable};
 
 use super::{
     traits::IsMerkleTreeBackend,
@@ -41,36 +38,6 @@ impl<T: PartialEq + Eq> Proof<T> {
     }
 }
 
-#[cfg(feature = "alloc")]
-impl<T> Serializable for Proof<T>
-where
-    T: Serializable + PartialEq + Eq,
-{
-    fn serialize(&self) -> Vec<u8> {
-        self.merkle_path
-            .iter()
-            .flat_map(|node| node.serialize())
-            .collect()
-    }
-}
-
-impl<T> Deserializable for Proof<T>
-where
-    T: Deserializable + PartialEq + Eq,
-{
-    fn deserialize(bytes: &[u8]) -> Result<Self, DeserializationError>
-    where
-        Self: Sized,
-    {
-        let mut merkle_path = Vec::new();
-        for elem in bytes[0..].chunks(8) {
-            let node = T::deserialize(elem)?;
-            merkle_path.push(node);
-        }
-        Ok(Self { merkle_path })
-    }
-}
-
 /// Stores all the nodes needed to prove the inclusion of multiple leaves.
 ///
 /// # Proof Ordering
diff --git a/crypto/math/src/errors.rs b/crypto/math/src/errors.rs
index db994e0cd..3c748f0de 100644
--- a/crypto/math/src/errors.rs
+++ b/crypto/math/src/errors.rs
@@ -2,41 +2,13 @@
 pub enum ByteConversionError {
     FromBEBytesError,
     FromLEBytesError,
-    InvalidValue,
-    PointNotInSubgroup,
-    ValueNotCompressed,
     ValueNotReduced,
 }
 
 #[derive(Debug, PartialEq, Eq)]
 pub enum CreationError {
     InvalidHexString,
-    InvalidDecString,
     HexStringIsTooBig,
     CanonicalOutOfRange,
     EmptyString,
 }
-
-#[derive(Debug, PartialEq, Eq)]
-pub enum DeserializationError {
-    InvalidAmountOfBytes,
-    FieldFromBytesError,
-    PointerSizeError,
-    InvalidValue,
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub enum PairingError {
-    PointNotInSubgroup,
-    DivisionByZero,
-}
-
-impl From<ByteConversionError> for DeserializationError {
-    fn from(error: ByteConversionError) -> Self {
-        match error {
-            ByteConversionError::FromBEBytesError => DeserializationError::FieldFromBytesError,
-            ByteConversionError::FromLEBytesError => DeserializationError::FieldFromBytesError,
-            _ => DeserializationError::InvalidValue,
-        }
-    }
-}
diff --git a/crypto/math/src/traits.rs b/crypto/math/src/traits.rs
index b92441312..7924a177c 100644
--- a/crypto/math/src/traits.rs
+++ b/crypto/math/src/traits.rs
@@ -1,5 +1,3 @@
-use crate::errors::DeserializationError;
-
 use crate::errors::ByteConversionError;
 /// A trait for converting an element to and from its byte representation and
 /// for getting an element from its byte representation in big-endian or
@@ -98,10 +96,3 @@ impl ByteConversion for u64 {
         ))
     }
 }
-
-/// Deserialize function without args
-pub trait Deserializable {
-    fn deserialize(bytes: &[u8]) -> Result<Self, DeserializationError>
-    where
-        Self: Sized;
-}

From 4c94b8c504c69b6a53243449613b7378b3ba33cc Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 09:59:35 -0300
Subject: [PATCH 02/10] refactor(math): remove vestigial helpers module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`helpers::next_power_of_two(u64)` was bit-for-bit equivalent to stdlib
`u64::next_power_of_two()`. Its only caller, `resize_to_next_power_of_two`,
was used solely by the `fibonacci_rap` example — production trace sizing
already calls stdlib `.next_power_of_two()` directly everywhere.

Move `resize_to_next_power_of_two` into `examples/fibonacci_rap.rs` as a
local helper (rewritten on `usize` with the stdlib call, dropping the old
`try_into().unwrap()` TODO), and delete the `helpers` module.
---
 crypto/math/src/helpers.rs                 | 27 ----------------------
 crypto/math/src/lib.rs                     |  1 -
 crypto/stark/src/examples/fibonacci_rap.rs | 10 +++++++-
 3 files changed, 9 insertions(+), 29 deletions(-)
 delete mode 100644 crypto/math/src/helpers.rs

diff --git a/crypto/math/src/helpers.rs b/crypto/math/src/helpers.rs
deleted file mode 100644
index 9a04e33b8..000000000
--- a/crypto/math/src/helpers.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-#[cfg(feature = "alloc")]
-use crate::field::{element::FieldElement, traits::IsFFTField};
-
-/// Computes the power of two that is equal or greater than n
-pub fn next_power_of_two(n: u64) -> u64 {
-    if n <= 1 {
-        1
-    } else {
-        (u64::MAX >> (n - 1).leading_zeros()) + 1
-    }
-}
-
-/// Pads the trace table with zeros until the length of the columns of the trace
-/// is equal to a power of 2
-/// This is required to ensure that we can use the radix-2 Cooley-Tukey FFT algorithm
-#[cfg(feature = "alloc")]
-pub fn resize_to_next_power_of_two<F: IsFFTField>(
-    trace_colums: &mut [alloc::vec::Vec<FieldElement<F>>],
-) {
-    trace_colums.iter_mut().for_each(|col| {
-        // TODO: Remove this unwrap. This may panic if the usize cant be
-        // casted into a u64.
-        let col_len = col.len().try_into().unwrap();
-        let next_power_of_two_len = next_power_of_two(col_len);
-        col.resize(next_power_of_two_len as usize, FieldElement::<F>::zero())
-    })
-}
diff --git a/crypto/math/src/lib.rs b/crypto/math/src/lib.rs
index 9d5e6dd97..11d5cda31 100644
--- a/crypto/math/src/lib.rs
+++ b/crypto/math/src/lib.rs
@@ -5,7 +5,6 @@ extern crate alloc;
 
 pub mod errors;
 pub mod field;
-pub mod helpers;
 pub mod spill_safe;
 pub mod traits;
 pub mod unsigned_integer;
diff --git a/crypto/stark/src/examples/fibonacci_rap.rs b/crypto/stark/src/examples/fibonacci_rap.rs
index dfd557b83..10f1827d2 100644
--- a/crypto/stark/src/examples/fibonacci_rap.rs
+++ b/crypto/stark/src/examples/fibonacci_rap.rs
@@ -13,10 +13,18 @@ use crate::{
 use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
 use math::{
     field::{element::FieldElement, traits::IsFFTField},
-    helpers::resize_to_next_power_of_two,
     traits::ByteConversion,
 };
 
+/// Pads each trace column with zeros up to the next power of two so the
+/// radix-2 FFT can be applied. Local to this example — the production
+/// prover sizes its traces directly.
+fn resize_to_next_power_of_two<F: IsFFTField>(trace_columns: &mut [Vec<FieldElement<F>>]) {
+    for col in trace_columns.iter_mut() {
+        col.resize(col.len().next_power_of_two(), FieldElement::<F>::zero());
+    }
+}
+
 #[derive(Clone)]
 struct FibConstraint<F: IsFFTField> {
     phantom: PhantomData<F>,

From 4d3bc3db3d53ca4f140cc99d4d0a10dd4d44a1ce Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 10:00:40 -0300
Subject: [PATCH 03/10] refactor(math): gate evaluate_fft_bit_reversed behind
 cfg(test)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`evaluate_fft_bit_reversed` was a non-gated `pub fn` but is only called
from the FFT property tests — no production caller in stark or prover.
Gate it `#[cfg(test)]` so it stops appearing in the production API
surface. Restore the gate (or remove it) when a real caller appears.
---
 crypto/math/src/fft/polynomial.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs
index ccc1ac391..039c02549 100644
--- a/crypto/math/src/fft/polynomial.rs
+++ b/crypto/math/src/fft/polynomial.rs
@@ -84,6 +84,10 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
     /// skipping the final natural-order permutation. Use when the consumer expects
     /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as
     /// {f(x), f(-x)}).
+    ///
+    /// Currently exercised only by the FFT property tests; gated to keep it out
+    /// of the production surface until a non-test caller needs it.
+    #[cfg(test)]
     pub fn evaluate_fft_bit_reversed<F: IsFFTField + IsSubFieldOf<E>>(
         poly: &Polynomial<FieldElement<E>>,
         blowup_factor: usize,

From aa182961df1f2b2906ccb4f240267d393f0db37b Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 10:08:58 -0300
Subject: [PATCH 04/10] refactor(math): gate test-only Polynomial::{truncate,
 reverse}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both methods are called only from `fast_division` / `invert_polynomial_mod`,
which are themselves already `#[cfg(test)]`. No production caller in stark
or prover. Gate them `#[cfg(test)]` so they leave the production API surface.

`long_division_with_remainder` is left public — it is used by the
debug-checks path (`check_boundary_polys_divisibility`) and is a
legitimate general polynomial operation other code could want.
---
 crypto/math/src/polynomial/mod.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/crypto/math/src/polynomial/mod.rs b/crypto/math/src/polynomial/mod.rs
index 74cca39df..098362d29 100644
--- a/crypto/math/src/polynomial/mod.rs
+++ b/crypto/math/src/polynomial/mod.rs
@@ -191,6 +191,8 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
         }
     }
 
+    /// Only used by the test-only `fast_division` / `invert_polynomial_mod`.
+    #[cfg(test)]
     pub fn truncate(&self, k: usize) -> Self {
         if k == 0 {
             Self::zero()
@@ -198,6 +200,9 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
             Self::new(&self.coefficients[0..k.min(self.coefficients.len())])
         }
     }
+
+    /// Only used by the test-only `fast_division`.
+    #[cfg(test)]
     pub fn reverse(&self, d: usize) -> Self {
         let mut coeffs = self.coefficients.clone();
         coeffs.resize(d + 1, FieldElement::zero());

From 5e459715664cd599824a4d3ddbcdeb94e7d14878 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 10:25:20 -0300
Subject: [PATCH 05/10] refactor(math): flatten fft/cpu/* up to fft/*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `fft/cpu/` directory implied a `fft/gpu/` sibling that never existed —
GPU code lives in the separate `crypto/math-cuda` crate. The `cpu` nesting
was a vestigial holdover.

Move `bit_reversing`, `bowers_fft`, `roots_of_unity`, `fft`, and
`bowers_fft_tests` from `fft/cpu/` up to `fft/`; delete `fft/cpu/mod.rs`
and merge its declarations into `fft/mod.rs`. Update every
`fft::cpu::...` / `super::cpu::...` path across math, stark, prover, and
math-cuda to `fft::...`.

Pure module move — no logic change. 216 math tests pass; stark, prover,
math-cuda all build.
---
 crypto/math-cuda/tests/bench_quick.rs                  |  2 +-
 crypto/math-cuda/tests/lde.rs                          |  2 +-
 crypto/math-cuda/tests/lde_batch.rs                    |  2 +-
 crypto/math-cuda/tests/lde_batch_ext3.rs               |  2 +-
 crypto/math/src/fft/{cpu => }/bit_reversing.rs         |  0
 crypto/math/src/fft/{cpu => }/bowers_fft.rs            |  2 +-
 crypto/math/src/fft/{cpu => }/bowers_fft_tests.rs      |  8 ++++----
 crypto/math/src/fft/cpu/mod.rs                         |  8 --------
 crypto/math/src/fft/{cpu => }/fft.rs                   |  4 ++--
 crypto/math/src/fft/mod.rs                             | 10 +++++++++-
 crypto/math/src/fft/polynomial.rs                      |  8 ++++----
 crypto/math/src/fft/{cpu => }/roots_of_unity.rs        |  0
 crypto/math/src/fft/test_helpers.rs                    |  2 +-
 crypto/math/src/tests/barycentric_tests.rs             |  2 +-
 .../src/tests/fft_friendly_u64_goldilocks_tests.rs     |  2 +-
 crypto/math/src/tests/fft_tests.rs                     | 10 +++++-----
 crypto/stark/src/domain.rs                             |  2 +-
 crypto/stark/src/fri/fri_functions.rs                  |  2 +-
 crypto/stark/src/prover.rs                             |  4 ++--
 crypto/stark/src/tests/fri_tests.rs                    |  2 +-
 crypto/stark/src/verifier.rs                           |  2 +-
 prover/src/tables/bitwise.rs                           |  2 +-
 prover/src/tables/decode.rs                            |  2 +-
 prover/src/tables/keccak_rc.rs                         |  2 +-
 prover/src/tables/page.rs                              |  2 +-
 prover/src/tables/register.rs                          |  2 +-
 26 files changed, 43 insertions(+), 43 deletions(-)
 rename crypto/math/src/fft/{cpu => }/bit_reversing.rs (100%)
 rename crypto/math/src/fft/{cpu => }/bowers_fft.rs (99%)
 rename crypto/math/src/fft/{cpu => }/bowers_fft_tests.rs (99%)
 delete mode 100644 crypto/math/src/fft/cpu/mod.rs
 rename crypto/math/src/fft/{cpu => }/fft.rs (97%)
 rename crypto/math/src/fft/{cpu => }/roots_of_unity.rs (100%)

diff --git a/crypto/math-cuda/tests/bench_quick.rs b/crypto/math-cuda/tests/bench_quick.rs
index 9e3883085..55ece98b3 100644
--- a/crypto/math-cuda/tests/bench_quick.rs
+++ b/crypto/math-cuda/tests/bench_quick.rs
@@ -3,7 +3,7 @@
 
 use std::time::Instant;
 
-use math::fft::cpu::bowers_fft::LayerTwiddles;
+use math::fft::bowers_fft::LayerTwiddles;
 use math::field::element::FieldElement;
 use math::field::goldilocks::GoldilocksField;
 use math::field::traits::IsField;
diff --git a/crypto/math-cuda/tests/lde.rs b/crypto/math-cuda/tests/lde.rs
index 110997e6e..5bc65b9bc 100644
--- a/crypto/math-cuda/tests/lde.rs
+++ b/crypto/math-cuda/tests/lde.rs
@@ -2,7 +2,7 @@
 //! `Polynomial::coset_lde_full_expand` for a sweep of realistic sizes and
 //! blowup factors.
 
-use math::fft::cpu::bowers_fft::LayerTwiddles;
+use math::fft::bowers_fft::LayerTwiddles;
 use math::field::element::FieldElement;
 use math::field::goldilocks::GoldilocksField;
 use math::field::traits::{IsField, IsPrimeField};
diff --git a/crypto/math-cuda/tests/lde_batch.rs b/crypto/math-cuda/tests/lde_batch.rs
index b7120ca28..cfa4ab3c8 100644
--- a/crypto/math-cuda/tests/lde_batch.rs
+++ b/crypto/math-cuda/tests/lde_batch.rs
@@ -1,7 +1,7 @@
 //! Batched coset LDE must agree with running the CPU single-column LDE on
 //! each column independently. Sweeps a few realistic (n, blowup, m) tuples.
 
-use math::fft::cpu::bowers_fft::LayerTwiddles;
+use math::fft::bowers_fft::LayerTwiddles;
 use math::field::element::FieldElement;
 use math::field::goldilocks::GoldilocksField;
 use math::field::traits::{IsField, IsPrimeField};
diff --git a/crypto/math-cuda/tests/lde_batch_ext3.rs b/crypto/math-cuda/tests/lde_batch_ext3.rs
index c1156edaa..60adc8a49 100644
--- a/crypto/math-cuda/tests/lde_batch_ext3.rs
+++ b/crypto/math-cuda/tests/lde_batch_ext3.rs
@@ -1,7 +1,7 @@
 //! Ext3 batched coset LDE must agree with the CPU `coset_lde_full_expand`
 //! on each column independently when run over `FieldElement<Ext3>`.
 
-use math::fft::cpu::bowers_fft::LayerTwiddles;
+use math::fft::bowers_fft::LayerTwiddles;
 use math::field::element::FieldElement;
 use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
 use math::field::goldilocks::GoldilocksField;
diff --git a/crypto/math/src/fft/cpu/bit_reversing.rs b/crypto/math/src/fft/bit_reversing.rs
similarity index 100%
rename from crypto/math/src/fft/cpu/bit_reversing.rs
rename to crypto/math/src/fft/bit_reversing.rs
diff --git a/crypto/math/src/fft/cpu/bowers_fft.rs b/crypto/math/src/fft/bowers_fft.rs
similarity index 99%
rename from crypto/math/src/fft/cpu/bowers_fft.rs
rename to crypto/math/src/fft/bowers_fft.rs
index b9c826a42..60a15410e 100644
--- a/crypto/math/src/fft/cpu/bowers_fft.rs
+++ b/crypto/math/src/fft/bowers_fft.rs
@@ -16,7 +16,7 @@
 //! # Usage
 //!
 //! ```ignore
-//! use math::fft::cpu::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused};
+//! use math::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused};
 //!
 //! let order = 10u64; // FFT size = 2^10 = 1024
 //! let layer_twiddles = LayerTwiddles::<F>::new(order).unwrap();
diff --git a/crypto/math/src/fft/cpu/bowers_fft_tests.rs b/crypto/math/src/fft/bowers_fft_tests.rs
similarity index 99%
rename from crypto/math/src/fft/cpu/bowers_fft_tests.rs
rename to crypto/math/src/fft/bowers_fft_tests.rs
index ed6931f85..396f85f9b 100644
--- a/crypto/math/src/fft/cpu/bowers_fft_tests.rs
+++ b/crypto/math/src/fft/bowers_fft_tests.rs
@@ -3,9 +3,9 @@
 //! Separated from the main implementation to keep the library code concise.
 
 use super::bowers_fft::*;
-use crate::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
-use crate::fft::cpu::fft::in_place_nr_2radix_fft;
-use crate::fft::cpu::roots_of_unity::get_powers_of_primitive_root;
+use crate::fft::bit_reversing::in_place_bit_reverse_permute;
+use crate::fft::fft::in_place_nr_2radix_fft;
+use crate::fft::roots_of_unity::get_powers_of_primitive_root;
 use crate::field::element::FieldElement;
 use crate::field::goldilocks::GoldilocksField;
 use crate::field::traits::{IsFFTField, RootsConfig};
@@ -675,7 +675,7 @@ mod parallel_tests {
 #[cfg(feature = "parallel")]
 #[test]
 fn test_adaptive_threshold_scales_with_threads() {
-    use crate::fft::cpu::bowers_fft::bowers_fft_opt_fused_parallel;
+    use crate::fft::bowers_fft::bowers_fft_opt_fused_parallel;
     use rayon::ThreadPoolBuilder;
 
     // Test with different thread counts
diff --git a/crypto/math/src/fft/cpu/mod.rs b/crypto/math/src/fft/cpu/mod.rs
deleted file mode 100644
index 2fb4a1aad..000000000
--- a/crypto/math/src/fft/cpu/mod.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-pub mod bit_reversing;
-#[cfg(feature = "alloc")]
-pub mod bowers_fft;
-#[cfg(all(test, feature = "alloc"))]
-mod bowers_fft_tests;
-pub mod fft;
-#[cfg(feature = "alloc")]
-pub mod roots_of_unity;
diff --git a/crypto/math/src/fft/cpu/fft.rs b/crypto/math/src/fft/fft.rs
similarity index 97%
rename from crypto/math/src/fft/cpu/fft.rs
rename to crypto/math/src/fft/fft.rs
index 3f0882a51..2fca71f11 100644
--- a/crypto/math/src/fft/cpu/fft.rs
+++ b/crypto/math/src/fft/fft.rs
@@ -95,8 +95,8 @@ where
 
 #[cfg(all(test, feature = "alloc"))]
 mod tests {
-    use crate::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
-    use crate::fft::cpu::roots_of_unity::get_powers_of_primitive_root;
+    use crate::fft::bit_reversing::in_place_bit_reverse_permute;
+    use crate::fft::roots_of_unity::get_powers_of_primitive_root;
     use crate::fft::test_helpers::naive_matrix_dft_test;
     use crate::field::{test_fields::u64_test_field::U64TestField, traits::RootsConfig};
     use proptest::{collection, prelude::*};
diff --git a/crypto/math/src/fft/mod.rs b/crypto/math/src/fft/mod.rs
index 62610ad52..5c5e7b809 100644
--- a/crypto/math/src/fft/mod.rs
+++ b/crypto/math/src/fft/mod.rs
@@ -1,7 +1,15 @@
-pub mod cpu;
+pub mod bit_reversing;
+#[cfg(feature = "alloc")]
+pub mod bowers_fft;
 pub mod errors;
+pub mod fft;
 #[cfg(feature = "alloc")]
 pub mod polynomial;
+#[cfg(feature = "alloc")]
+pub mod roots_of_unity;
+
+#[cfg(all(test, feature = "alloc"))]
+mod bowers_fft_tests;
 
 #[cfg(all(test, feature = "alloc"))]
 pub(crate) mod test_helpers;
diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs
index 039c02549..784fcadea 100644
--- a/crypto/math/src/fft/polynomial.rs
+++ b/crypto/math/src/fft/polynomial.rs
@@ -6,13 +6,13 @@ use crate::{
 };
 use alloc::{vec, vec::Vec};
 
-use super::cpu::{
+use super::{
     bit_reversing::in_place_bit_reverse_permute,
     bowers_fft::{LayerTwiddles, bowers_fft_opt_fused, bowers_ifft_opt},
 };
 
 #[cfg(feature = "parallel")]
-use super::cpu::bowers_fft::{bowers_fft_opt_fused_parallel, bowers_ifft_opt_parallel};
+use super::bowers_fft::{bowers_fft_opt_fused_parallel, bowers_ifft_opt_parallel};
 
 /// Threshold for dispatching to parallel FFT.
 /// Below this size, sequential FFT is faster (avoids Rayon overhead).
@@ -464,7 +464,7 @@ mod tests {
 
     #[test]
     fn coset_lde_full_into_matches_coset_lde_full() {
-        use crate::fft::cpu::bowers_fft::LayerTwiddles;
+        use crate::fft::bowers_fft::LayerTwiddles;
 
         let offset = FE::from(3u64);
         let blowup_factor = 2;
@@ -512,7 +512,7 @@ mod tests {
 
     #[test]
     fn coset_lde_full_into_reuses_buffer() {
-        use crate::fft::cpu::bowers_fft::LayerTwiddles;
+        use crate::fft::bowers_fft::LayerTwiddles;
 
         let offset = FE::from(5u64);
         let blowup_factor = 2usize;
diff --git a/crypto/math/src/fft/cpu/roots_of_unity.rs b/crypto/math/src/fft/roots_of_unity.rs
similarity index 100%
rename from crypto/math/src/fft/cpu/roots_of_unity.rs
rename to crypto/math/src/fft/roots_of_unity.rs
diff --git a/crypto/math/src/fft/test_helpers.rs b/crypto/math/src/fft/test_helpers.rs
index bc072236a..92d002ad0 100644
--- a/crypto/math/src/fft/test_helpers.rs
+++ b/crypto/math/src/fft/test_helpers.rs
@@ -1,5 +1,5 @@
 use crate::{
-    fft::cpu::roots_of_unity::get_powers_of_primitive_root,
+    fft::roots_of_unity::get_powers_of_primitive_root,
     field::{
         element::FieldElement,
         traits::{IsFFTField, RootsConfig},
diff --git a/crypto/math/src/tests/barycentric_tests.rs b/crypto/math/src/tests/barycentric_tests.rs
index aadef40d9..aff4afbc6 100644
--- a/crypto/math/src/tests/barycentric_tests.rs
+++ b/crypto/math/src/tests/barycentric_tests.rs
@@ -1,4 +1,4 @@
-use crate::fft::cpu::roots_of_unity::get_powers_of_primitive_root_coset;
+use crate::fft::roots_of_unity::get_powers_of_primitive_root_coset;
 use crate::field::element::FieldElement;
 use crate::field::goldilocks::GoldilocksField;
 use crate::polynomial::{
diff --git a/crypto/math/src/tests/fft_friendly_u64_goldilocks_tests.rs b/crypto/math/src/tests/fft_friendly_u64_goldilocks_tests.rs
index 9019e017a..01bd0a176 100644
--- a/crypto/math/src/tests/fft_friendly_u64_goldilocks_tests.rs
+++ b/crypto/math/src/tests/fft_friendly_u64_goldilocks_tests.rs
@@ -272,7 +272,7 @@ fn test_from_i64_max_value() {
 #[cfg(all(feature = "std", not(feature = "instruments")))]
 mod fft_tests {
     use super::*;
-    use crate::fft::cpu::roots_of_unity::{
+    use crate::fft::roots_of_unity::{
         get_powers_of_primitive_root, get_powers_of_primitive_root_coset,
     };
     use crate::field::traits::{IsFFTField, RootsConfig};
diff --git a/crypto/math/src/tests/fft_tests.rs b/crypto/math/src/tests/fft_tests.rs
index 767507ad0..2427d64ae 100644
--- a/crypto/math/src/tests/fft_tests.rs
+++ b/crypto/math/src/tests/fft_tests.rs
@@ -1,6 +1,6 @@
 #[cfg(test)]
 mod fft_helpers_test {
-    use crate::fft::cpu::roots_of_unity::get_powers_of_primitive_root;
+    use crate::fft::roots_of_unity::get_powers_of_primitive_root;
     use crate::fft::test_helpers::naive_matrix_dft_test;
     use crate::field::element::FieldElement;
     use crate::field::test_fields::u64_test_field::U64TestField;
@@ -48,8 +48,8 @@ mod fft_helpers_test {
 mod fft_polynomial_tests {
     use crate::field::traits::IsField;
 
-    use crate::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
-    use crate::fft::cpu::roots_of_unity::{
+    use crate::fft::bit_reversing::in_place_bit_reverse_permute;
+    use crate::fft::roots_of_unity::{
         get_powers_of_primitive_root, get_powers_of_primitive_root_coset,
     };
     use crate::fft::polynomial::compose_fft;
@@ -306,8 +306,8 @@ mod fft_polynomial_tests {
 
 #[cfg(test)]
 mod roots_of_unity_tests {
-    use crate::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
-    use crate::fft::cpu::roots_of_unity::get_powers_of_primitive_root;
+    use crate::fft::bit_reversing::in_place_bit_reverse_permute;
+    use crate::fft::roots_of_unity::get_powers_of_primitive_root;
     use crate::field::test_fields::u64_test_field::U64TestField;
     use crate::field::traits::RootsConfig;
     use proptest::prelude::*;
diff --git a/crypto/stark/src/domain.rs b/crypto/stark/src/domain.rs
index 4ee720aae..a435ab4f9 100644
--- a/crypto/stark/src/domain.rs
+++ b/crypto/stark/src/domain.rs
@@ -1,5 +1,5 @@
 use math::{
-    fft::cpu::roots_of_unity::get_powers_of_primitive_root_coset,
+    fft::roots_of_unity::get_powers_of_primitive_root_coset,
     field::{
         element::FieldElement,
         traits::{IsFFTField, IsField, IsSubFieldOf},
diff --git a/crypto/stark/src/fri/fri_functions.rs b/crypto/stark/src/fri/fri_functions.rs
index 8bd355ec4..bc27f6476 100644
--- a/crypto/stark/src/fri/fri_functions.rs
+++ b/crypto/stark/src/fri/fri_functions.rs
@@ -1,4 +1,4 @@
-use math::fft::cpu::{
+use math::fft::{
     bit_reversing::in_place_bit_reverse_permute, roots_of_unity::get_powers_of_primitive_root_coset,
 };
 use math::field::{
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 68f50ea53..2378b221c 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -4,8 +4,8 @@ use std::sync::Arc;
 use std::time::Instant;
 
 use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
-use math::fft::cpu::bit_reversing::{in_place_bit_reverse_permute, reverse_index};
-use math::fft::cpu::bowers_fft::LayerTwiddles;
+use math::fft::bit_reversing::{in_place_bit_reverse_permute, reverse_index};
+use math::fft::bowers_fft::LayerTwiddles;
 use math::fft::errors::FFTError;
 
 use log::info;
diff --git a/crypto/stark/src/tests/fri_tests.rs b/crypto/stark/src/tests/fri_tests.rs
index 8012d0b0c..503d0946a 100644
--- a/crypto/stark/src/tests/fri_tests.rs
+++ b/crypto/stark/src/tests/fri_tests.rs
@@ -1,5 +1,5 @@
 use crate::fri::fri_functions::{compute_coset_twiddles_inv, fold_evaluations_in_place};
-use math::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::field::element::FieldElement;
 use math::field::goldilocks::GoldilocksField;
 use math::field::traits::IsField;
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index a1a68930a..3091a5c54 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -18,7 +18,7 @@ use log::error;
 #[cfg(feature = "debug-checks")]
 use log::info;
 use math::{
-    fft::cpu::bit_reversing::reverse_index,
+    fft::bit_reversing::reverse_index,
     field::{
         element::FieldElement,
         traits::{IsFFTField, IsField, IsSubFieldOf},
diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs
index 455f696f2..77001773f 100644
--- a/prover/src/tables/bitwise.rs
+++ b/prover/src/tables/bitwise.rs
@@ -28,7 +28,7 @@
 
 use std::sync::OnceLock;
 
-use math::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs
index 913867b59..1c87f3862 100644
--- a/prover/src/tables/decode.rs
+++ b/prover/src/tables/decode.rs
@@ -38,7 +38,7 @@
 use executor::elf::Elf;
 use executor::vm::instruction::decoding::{Instruction, InstructionError};
 use executor::vm::memory::U64HashMap;
-use math::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs
index c2e14d643..92f21a909 100644
--- a/prover/src/tables/keccak_rc.rs
+++ b/prover/src/tables/keccak_rc.rs
@@ -9,7 +9,7 @@
 
 use std::sync::OnceLock;
 
-use math::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::field::element::FieldElement;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs
index a4597e1b8..f196dc659 100644
--- a/prover/src/tables/page.rs
+++ b/prover/src/tables/page.rs
@@ -33,7 +33,7 @@
 use std::collections::HashMap;
 use std::sync::OnceLock;
 
-use math::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
 use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing};
diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs
index 976b40444..c73498737 100644
--- a/prover/src/tables/register.rs
+++ b/prover/src/tables/register.rs
@@ -20,7 +20,7 @@
 
 use std::collections::HashMap;
 
-use math::fft::cpu::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};

From e2986fcd7582953b46d3ca418baed55bbc741187 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 10:29:50 -0300
Subject: [PATCH 06/10] refactor(math): collapse polynomial/ folder into one
 polynomial.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two problems compounded:
1. `polynomial/` was a folder holding a single file (`mod.rs`) — the
   directory level served no purpose.
2. Two files were named `polynomial` (`polynomial/mod.rs` and
   `fft/polynomial.rs`), both defining `impl Polynomial<...>` blocks.

Merge `fft/polynomial.rs` (the FFT-specific `impl Polynomial` blocks plus
the `evaluate_fft_cpu` / `interpolate_fft_cpu` free functions) into
`polynomial/mod.rs`, then rename it to a top-level `crypto/math/src/polynomial.rs`
and delete the folder.

Result: one `polynomial.rs` holding the `Polynomial` type and all of its
methods (core ring ops + FFT). `fft/` is now purely the FFT *algorithm*
layer (Bowers, twiddles, bit-reversal, roots of unity).

Also delete the stale `polynomial/README.md` — inherited lambdaworks
docs referencing multilinear-polynomial files that don't exist in
lambda_vm.

Updated the one external path (`fft::polynomial::compose_fft` ->
`polynomial::compose_fft`). 216 math + 124 stark tests pass.
---
 crypto/math/src/fft/mod.rs                    |   2 -
 crypto/math/src/fft/polynomial.rs             | 561 ------------------
 .../src/{polynomial/mod.rs => polynomial.rs}  | 557 ++++++++++++++++-
 crypto/math/src/polynomial/README.md          | 131 ----
 crypto/math/src/tests/fft_tests.rs            |   2 +-
 5 files changed, 557 insertions(+), 696 deletions(-)
 delete mode 100644 crypto/math/src/fft/polynomial.rs
 rename crypto/math/src/{polynomial/mod.rs => polynomial.rs} (58%)
 delete mode 100644 crypto/math/src/polynomial/README.md

diff --git a/crypto/math/src/fft/mod.rs b/crypto/math/src/fft/mod.rs
index 5c5e7b809..ee8b6bd73 100644
--- a/crypto/math/src/fft/mod.rs
+++ b/crypto/math/src/fft/mod.rs
@@ -4,8 +4,6 @@ pub mod bowers_fft;
 pub mod errors;
 pub mod fft;
 #[cfg(feature = "alloc")]
-pub mod polynomial;
-#[cfg(feature = "alloc")]
 pub mod roots_of_unity;
 
 #[cfg(all(test, feature = "alloc"))]
diff --git a/crypto/math/src/fft/polynomial.rs b/crypto/math/src/fft/polynomial.rs
deleted file mode 100644
index 784fcadea..000000000
--- a/crypto/math/src/fft/polynomial.rs
+++ /dev/null
@@ -1,561 +0,0 @@
-use crate::fft::errors::FFTError;
-use crate::field::traits::{IsField, IsSubFieldOf};
-use crate::{
-    field::{element::FieldElement, traits::IsFFTField},
-    polynomial::Polynomial,
-};
-use alloc::{vec, vec::Vec};
-
-use super::{
-    bit_reversing::in_place_bit_reverse_permute,
-    bowers_fft::{LayerTwiddles, bowers_fft_opt_fused, bowers_ifft_opt},
-};
-
-#[cfg(feature = "parallel")]
-use super::bowers_fft::{bowers_fft_opt_fused_parallel, bowers_ifft_opt_parallel};
-
-/// Threshold for dispatching to parallel FFT.
-/// Below this size, sequential FFT is faster (avoids Rayon overhead).
-/// At 2^14 = 16384 elements, the parallel version starts to win
-/// because later butterfly layers have enough blocks for effective parallelism.
-#[cfg(feature = "parallel")]
-const PARALLEL_FFT_THRESHOLD: usize = 1 << 14;
-
-/// Dispatch forward FFT (DIF) to parallel or sequential implementation based on buffer size.
-#[inline]
-fn dispatch_fft<F: IsFFTField + IsSubFieldOf<E>, E: IsField + Send + Sync>(
-    buffer: &mut [FieldElement<E>],
-    twiddles: &LayerTwiddles<F>,
-) -> Result<(), FFTError> {
-    #[cfg(feature = "parallel")]
-    {
-        if buffer.len() >= PARALLEL_FFT_THRESHOLD {
-            return bowers_fft_opt_fused_parallel(buffer, twiddles);
-        }
-    }
-    bowers_fft_opt_fused(buffer, twiddles)
-}
-
-/// Dispatch inverse FFT (DIT) to parallel or sequential implementation based on buffer size.
-#[inline]
-fn dispatch_ifft<F: IsFFTField + IsSubFieldOf<E>, E: IsField + Send + Sync>(
-    buffer: &mut [FieldElement<E>],
-    twiddles: &LayerTwiddles<F>,
-) -> Result<(), FFTError> {
-    #[cfg(feature = "parallel")]
-    {
-        if buffer.len() >= PARALLEL_FFT_THRESHOLD {
-            return bowers_ifft_opt_parallel(buffer, twiddles);
-        }
-    }
-    bowers_ifft_opt(buffer, twiddles)
-}
-
-impl<E: IsField> Polynomial<FieldElement<E>> {
-    /// Returns `N` evaluations of this polynomial using FFT over a domain in a subfield F of E (so the results
-    /// are P(w^i), with w being a primitive root of unity).
-    /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`.
-    /// If `domain_size` is `None`, it defaults to 0.
-    pub fn evaluate_fft<F: IsFFTField + IsSubFieldOf<E>>(
-        poly: &Polynomial<FieldElement<E>>,
-        blowup_factor: usize,
-        domain_size: Option<usize>,
-    ) -> Result<Vec<FieldElement<E>>, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let domain_size = domain_size.unwrap_or(0);
-        let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor;
-        if len.trailing_zeros() as u64 > F::TWO_ADICITY {
-            return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize));
-        }
-        if poly.coefficients().is_empty() {
-            return Ok(vec![FieldElement::zero(); len]);
-        }
-
-        let mut coeffs = poly.coefficients().to_vec();
-        coeffs.resize(len, FieldElement::zero());
-        // padding with zeros will make FFT return more evaluations of the same polynomial.
-
-        evaluate_fft_cpu::<F, E>(&coeffs)
-    }
-
-    /// Same as `evaluate_fft` but returns the evaluations in bit-reversed order,
-    /// skipping the final natural-order permutation. Use when the consumer expects
-    /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as
-    /// {f(x), f(-x)}).
-    ///
-    /// Currently exercised only by the FFT property tests; gated to keep it out
-    /// of the production surface until a non-test caller needs it.
-    #[cfg(test)]
-    pub fn evaluate_fft_bit_reversed<F: IsFFTField + IsSubFieldOf<E>>(
-        poly: &Polynomial<FieldElement<E>>,
-        blowup_factor: usize,
-        domain_size: Option<usize>,
-    ) -> Result<Vec<FieldElement<E>>, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let domain_size = domain_size.unwrap_or(0);
-        let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor;
-        if len.trailing_zeros() as u64 > F::TWO_ADICITY {
-            return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize));
-        }
-        if poly.coefficients().is_empty() {
-            return Ok(vec![FieldElement::zero(); len]);
-        }
-
-        let mut coeffs = poly.coefficients().to_vec();
-        coeffs.resize(len, FieldElement::zero());
-
-        evaluate_fft_cpu_raw::<F, E>(&coeffs, false)
-    }
-
-    /// Returns `N` evaluations with an offset of this polynomial using FFT over a domain in a subfield F of E
-    /// (so the results are P(w^i), with w being a primitive root of unity).
-    /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`.
-    /// If `domain_size` is `None`, it defaults to 0.
-    pub fn evaluate_offset_fft<F: IsFFTField + IsSubFieldOf<E>>(
-        poly: &Polynomial<FieldElement<E>>,
-        blowup_factor: usize,
-        domain_size: Option<usize>,
-        offset: &FieldElement<F>,
-    ) -> Result<Vec<FieldElement<E>>, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let scaled = poly.scale(offset);
-        Polynomial::evaluate_fft::<F>(&scaled, blowup_factor, domain_size)
-    }
-
-    /// Returns a new polynomial that interpolates `(w^i, fft_evals[i])`, with `w` being a
-    /// Nth primitive root of unity in a subfield F of E, and `i in 0..N`, with `N = fft_evals.len()`.
-    /// This is considered to be the inverse operation of [Self::evaluate_fft()].
-    pub fn interpolate_fft<F: IsFFTField + IsSubFieldOf<E>>(
-        fft_evals: &[FieldElement<E>],
-    ) -> Result<Self, FFTError>
-    where
-        E: Send + Sync,
-    {
-        interpolate_fft_cpu::<F, E>(fft_evals)
-    }
-
-    /// Returns a new polynomial that interpolates offset `(w^i, fft_evals[i])`, with `w` being a
-    /// Nth primitive root of unity in a subfield F of E, and `i in 0..N`, with `N = fft_evals.len()`.
-    /// This is considered to be the inverse operation of [Self::evaluate_offset_fft()].
-    pub fn interpolate_offset_fft<F: IsFFTField + IsSubFieldOf<E>>(
-        fft_evals: &[FieldElement<E>],
-        offset: &FieldElement<F>,
-    ) -> Result<Polynomial<FieldElement<E>>, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let scaled = Polynomial::interpolate_fft::<F>(fft_evals)?;
-        Ok(scaled.scale(&offset.inv().unwrap()))
-    }
-
-    /// Compute the coset LDE with pre-computed twiddle factors and pre-computed weights.
-    ///
-    /// Same as [`coset_lde_with_twiddles`], but also accepts pre-computed `weights[i] = offset^i / n`
-    /// so that the scaling step avoids the running product across columns.
-    /// Weights are in the base field F — the scaling `w * coeff` uses mixed F×E multiplication.
-    pub fn coset_lde_full<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
-        evals: &[FieldElement<E>],
-        blowup_factor: usize,
-        weights: &[FieldElement<F>],
-        inv_twiddles: &LayerTwiddles<F>,
-        fwd_twiddles: &LayerTwiddles<F>,
-    ) -> Result<Vec<FieldElement<E>>, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let n = evals.len();
-        if n == 0 {
-            return Ok(Vec::new());
-        }
-        let lde_size = n * blowup_factor;
-        let mut buffer = Vec::with_capacity(lde_size);
-        Self::coset_lde_full_into(
-            evals,
-            blowup_factor,
-            weights,
-            inv_twiddles,
-            fwd_twiddles,
-            &mut buffer,
-        )?;
-        Ok(buffer)
-    }
-
-    /// Compute the coset LDE into a caller-provided buffer, avoiding allocation when
-    /// `buffer.capacity() >= n * blowup_factor`.
-    ///
-    /// Same as [`coset_lde_full`], but writes into `buffer` instead of allocating a new Vec.
-    /// The buffer is cleared and reused: `buffer.clear(); buffer.extend_from_slice(evals);
-    /// buffer.resize(lde_size, zero)`. When the capacity is sufficient, no heap allocation occurs.
-    /// Weights are in the base field F — the scaling `w * coeff` uses mixed F×E multiplication.
-    pub fn coset_lde_full_into<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
-        evals: &[FieldElement<E>],
-        blowup_factor: usize,
-        weights: &[FieldElement<F>],
-        inv_twiddles: &LayerTwiddles<F>,
-        fwd_twiddles: &LayerTwiddles<F>,
-        buffer: &mut Vec<FieldElement<E>>,
-    ) -> Result<(), FFTError>
-    where
-        E: Send + Sync,
-    {
-        let n = evals.len();
-        if n == 0 {
-            buffer.clear();
-            return Ok(());
-        }
-        if !n.is_power_of_two() {
-            return Err(FFTError::InputError(n));
-        }
-        let lde_size = n * blowup_factor;
-
-        if (lde_size.trailing_zeros() as u64) > F::TWO_ADICITY {
-            return Err(FFTError::DomainSizeError(lde_size.trailing_zeros() as usize));
-        }
-
-        buffer.clear();
-        buffer.extend_from_slice(evals);
-        buffer.resize(lde_size, FieldElement::zero());
-
-        in_place_bit_reverse_permute(&mut buffer[..n]);
-        dispatch_ifft(&mut buffer[..n], inv_twiddles)?;
-
-        // Scale using pre-computed weights (base field) — F × E → E mixed multiplication.
-        for (coeff, w) in buffer[..n].iter_mut().zip(weights.iter()) {
-            *coeff = w * &*coeff;
-        }
-
-        dispatch_fft(buffer, fwd_twiddles)?;
-        in_place_bit_reverse_permute(buffer);
-
-        Ok(())
-    }
-
-    /// In-place coset LDE: the buffer already contains N evaluation points at `[0..N]`.
-    ///
-    /// This expands the buffer from N elements to `N * blowup_factor` by performing:
-    /// 1. iFFT on buffer[..N]
-    /// 2. Scale by pre-computed weights
-    /// 3. Zero-pad to N * blowup_factor
-    /// 4. Forward FFT on the full buffer
-    ///
-    /// Unlike [`coset_lde_full_into`], this skips the `clear + extend_from_slice` step
-    /// since data is already in the buffer. Used for transpose elimination: columns are
-    /// extracted directly into owned buffers, then expanded in-place.
-    pub fn coset_lde_full_expand<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
-        buffer: &mut Vec<FieldElement<E>>,
-        blowup_factor: usize,
-        weights: &[FieldElement<F>],
-        inv_twiddles: &LayerTwiddles<F>,
-        fwd_twiddles: &LayerTwiddles<F>,
-    ) -> Result<(), FFTError>
-    where
-        E: Send + Sync,
-    {
-        let n = buffer.len();
-        if n == 0 {
-            return Ok(());
-        }
-        if !n.is_power_of_two() {
-            return Err(FFTError::InputError(n));
-        }
-        let lde_size = n * blowup_factor;
-
-        if (lde_size.trailing_zeros() as u64) > F::TWO_ADICITY {
-            return Err(FFTError::DomainSizeError(lde_size.trailing_zeros() as usize));
-        }
-
-        // 1. iFFT on buffer[..n]
-        in_place_bit_reverse_permute(&mut buffer[..n]);
-        dispatch_ifft(&mut buffer[..n], inv_twiddles)?;
-
-        // 2. Scale using pre-computed weights (base field) — F × E → E mixed multiplication.
-        for (coeff, w) in buffer[..n].iter_mut().zip(weights.iter()) {
-            *coeff = w * &*coeff;
-        }
-
-        // 3. Zero-pad to lde_size
-        buffer.resize(lde_size, FieldElement::zero());
-
-        // 4. Forward FFT on the full buffer
-        dispatch_fft(buffer, fwd_twiddles)?;
-        in_place_bit_reverse_permute(buffer);
-
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-pub fn compose_fft<F, E>(
-    poly_1: &Polynomial<FieldElement<E>>,
-    poly_2: &Polynomial<FieldElement<E>>,
-) -> Polynomial<FieldElement<E>>
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField + Send + Sync,
-{
-    let poly_2_evaluations = Polynomial::evaluate_fft::<F>(poly_2, 1, None).unwrap();
-
-    let values: Vec<_> = poly_2_evaluations
-        .iter()
-        .map(|value| poly_1.evaluate(value))
-        .collect();
-
-    Polynomial::interpolate_fft::<F>(values.as_slice()).unwrap()
-}
-
-pub fn evaluate_fft_cpu<F, E>(coeffs: &[FieldElement<E>]) -> Result<Vec<FieldElement<E>>, FFTError>
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField + Send + Sync,
-{
-    evaluate_fft_cpu_raw::<F, E>(coeffs, true)
-}
-
-fn evaluate_fft_cpu_raw<F, E>(
-    coeffs: &[FieldElement<E>],
-    permute_to_natural: bool,
-) -> Result<Vec<FieldElement<E>>, FFTError>
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField + Send + Sync,
-{
-    let n = coeffs.len();
-    if !n.is_power_of_two() {
-        return Err(FFTError::InputError(n));
-    }
-    let order = n.trailing_zeros() as u64;
-    let layer_twiddles =
-        LayerTwiddles::<F>::new(order).ok_or(FFTError::DomainSizeError(order as usize))?;
-
-    let mut result = coeffs.to_vec();
-    dispatch_fft(&mut result, &layer_twiddles)?;
-    if permute_to_natural {
-        in_place_bit_reverse_permute(&mut result);
-    }
-    Ok(result)
-}
-
-pub fn interpolate_fft_cpu<F, E>(
-    fft_evals: &[FieldElement<E>],
-) -> Result<Polynomial<FieldElement<E>>, FFTError>
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField + Send + Sync,
-{
-    let n = fft_evals.len();
-    if !n.is_power_of_two() {
-        return Err(FFTError::InputError(n));
-    }
-    let order = n.trailing_zeros() as u64;
-    let inv_twiddles =
-        LayerTwiddles::<F>::new_inverse(order).ok_or(FFTError::DomainSizeError(order as usize))?;
-
-    let mut coeffs = fft_evals.to_vec();
-    // Bowers iFFT: bit-reverse first (natural → bit-reversed), then DIT inverse butterflies
-    in_place_bit_reverse_permute(&mut coeffs);
-    dispatch_ifft(&mut coeffs, &inv_twiddles)?;
-
-    // Scale by 1/n
-    let scale_factor = FieldElement::from(n as u64).inv().unwrap();
-    Ok(Polynomial::new(&coeffs).scale_coeffs(&scale_factor))
-}
-
-#[cfg(test)]
-impl<E: IsField> Polynomial<FieldElement<E>> {
-    /// Multiplies two polynomials using FFT.
-    pub fn fast_fft_multiplication<F: IsFFTField + IsSubFieldOf<E>>(
-        &self,
-        other: &Self,
-    ) -> Result<Self, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let domain_size = self.degree() + other.degree() + 1;
-        let p = Polynomial::evaluate_fft::<F>(self, 1, Some(domain_size))?;
-        let q = Polynomial::evaluate_fft::<F>(other, 1, Some(domain_size))?;
-        let r = p.into_iter().zip(q).map(|(a, b)| a * b).collect::<Vec<_>>();
-
-        Polynomial::interpolate_fft::<F>(&r)
-    }
-
-    /// Divides two polynomials with remainder using FFT.
-    pub fn fast_division<F: IsSubFieldOf<E> + IsFFTField>(
-        &self,
-        divisor: &Self,
-    ) -> Result<(Self, Self), FFTError>
-    where
-        E: Send + Sync,
-    {
-        use crate::field::errors::FieldError;
-
-        let n = self.degree();
-        let m = divisor.degree();
-        if divisor.coefficients.is_empty()
-            || divisor
-                .coefficients
-                .iter()
-                .all(|c| c == &FieldElement::zero())
-        {
-            return Err(FieldError::DivisionByZero.into());
-        }
-        if n < m {
-            return Ok((Self::zero(), self.clone()));
-        }
-        let d = n - m;
-        let a_rev = self.reverse(n);
-        let b_rev = divisor.reverse(m);
-        let inv_b_rev = b_rev.invert_polynomial_mod::<F>(d + 1)?;
-        let q = a_rev
-            .fast_fft_multiplication::<F>(&inv_b_rev)?
-            .truncate(d + 1)
-            .reverse(d);
-
-        let r = self - q.fast_fft_multiplication::<F>(divisor)?;
-        Ok((q, r))
-    }
-
-    /// Computes the inverse of polynomial P modulo x^k using Newton iteration.
-    pub fn invert_polynomial_mod<F: IsSubFieldOf<E> + IsFFTField>(
-        &self,
-        k: usize,
-    ) -> Result<Self, FFTError>
-    where
-        E: Send + Sync,
-    {
-        use crate::field::errors::FieldError;
-
-        if self.coefficients.is_empty()
-            || self.coefficients.iter().all(|c| c == &FieldElement::zero())
-        {
-            return Err(FieldError::DivisionByZero.into());
-        }
-        let mut q = Self::new(&[self.coefficients[0].inv()?]);
-        let mut current_precision = 1;
-
-        let two = Self::new(&[FieldElement::<F>::one() + FieldElement::one()]);
-        while current_precision < k {
-            current_precision *= 2;
-            let temp = self
-                .fast_fft_multiplication::<F>(&q)?
-                .truncate(current_precision);
-            let correction = &two - temp;
-            q = q
-                .fast_fft_multiplication::<F>(&correction)?
-                .truncate(current_precision);
-        }
-
-        Ok(q.truncate(k))
-    }
-}
-
-#[cfg(all(test, feature = "alloc"))]
-mod tests {
-    use super::*;
-    use crate::field::goldilocks::GoldilocksField;
-
-    type F = GoldilocksField;
-    type FE = FieldElement<F>;
-
-    #[test]
-    fn coset_lde_full_into_matches_coset_lde_full() {
-        use crate::fft::bowers_fft::LayerTwiddles;
-
-        let offset = FE::from(3u64);
-        let blowup_factor = 2;
-
-        for order in 1..=10 {
-            let n = 1usize << order;
-            let evals: Vec<FE> = (0..n).map(|i| FE::from((i * 7 + 13) as u64)).collect();
-
-            let lde_size = n * blowup_factor;
-            let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
-            let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
-
-            let n_inv = FE::from(n as u64).inv().unwrap();
-            let mut weights = Vec::with_capacity(n);
-            let mut offset_power = n_inv;
-            for _ in 0..n {
-                weights.push(offset_power);
-                offset_power = &offset_power * &offset;
-            }
-
-            let reference = Polynomial::<FE>::coset_lde_full::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-            )
-            .unwrap();
-
-            // Test with pre-allocated buffer
-            let mut buffer = Vec::with_capacity(lde_size);
-            Polynomial::<FE>::coset_lde_full_into::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-                &mut buffer,
-            )
-            .unwrap();
-
-            assert_eq!(reference, buffer, "Mismatch at order {}", order);
-        }
-    }
-
-    #[test]
-    fn coset_lde_full_into_reuses_buffer() {
-        use crate::fft::bowers_fft::LayerTwiddles;
-
-        let offset = FE::from(5u64);
-        let blowup_factor = 2usize;
-        let n = 16usize;
-        let lde_size = n * blowup_factor;
-
-        let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
-        let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
-
-        let n_inv = FE::from(n as u64).inv().unwrap();
-        let mut weights = Vec::with_capacity(n);
-        let mut offset_power = n_inv;
-        for _ in 0..n {
-            weights.push(offset_power);
-            offset_power = &offset_power * &offset;
-        }
-
-        // Pre-allocate buffer once, reuse for two different inputs
-        let mut buffer = Vec::with_capacity(lde_size);
-
-        for seed in [13u64, 42u64] {
-            let evals: Vec<FE> = (0..n).map(|i| FE::from(i as u64 * seed + 1)).collect();
-
-            let reference = Polynomial::<FE>::coset_lde_full::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-            )
-            .unwrap();
-
-            Polynomial::<FE>::coset_lde_full_into::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-                &mut buffer,
-            )
-            .unwrap();
-
-            assert_eq!(reference, buffer, "Mismatch for seed {}", seed);
-        }
-    }
-}
diff --git a/crypto/math/src/polynomial/mod.rs b/crypto/math/src/polynomial.rs
similarity index 58%
rename from crypto/math/src/polynomial/mod.rs
rename to crypto/math/src/polynomial.rs
index 098362d29..2600106a2 100644
--- a/crypto/math/src/polynomial/mod.rs
+++ b/crypto/math/src/polynomial.rs
@@ -1,5 +1,10 @@
 use super::field::element::FieldElement;
-use crate::field::traits::{IsField, IsSubFieldOf};
+use crate::fft::bit_reversing::in_place_bit_reverse_permute;
+use crate::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused, bowers_ifft_opt};
+#[cfg(feature = "parallel")]
+use crate::fft::bowers_fft::{bowers_fft_opt_fused_parallel, bowers_ifft_opt_parallel};
+use crate::fft::errors::FFTError;
+use crate::field::traits::{IsFFTField, IsField, IsSubFieldOf};
 use alloc::{borrow::ToOwned, vec, vec::Vec};
 use core::{fmt::Display, ops};
 /// Represents the polynomial c_0 + c_1 * X + c_2 * X^2 + ... + c_n * X^n
@@ -884,3 +889,553 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
         self.coefficients.pop();
     }
 }
+
+// =============================================================================
+// FFT-based polynomial methods (merged from the former fft/polynomial.rs)
+// =============================================================================
+
+/// Threshold for dispatching to parallel FFT.
+/// Below this size, sequential FFT is faster (avoids Rayon overhead).
+/// At 2^14 = 16384 elements, the parallel version starts to win
+/// because later butterfly layers have enough blocks for effective parallelism.
+#[cfg(feature = "parallel")]
+const PARALLEL_FFT_THRESHOLD: usize = 1 << 14;
+
+/// Dispatch forward FFT (DIF) to parallel or sequential implementation based on buffer size.
+#[inline]
+fn dispatch_fft<F: IsFFTField + IsSubFieldOf<E>, E: IsField + Send + Sync>(
+    buffer: &mut [FieldElement<E>],
+    twiddles: &LayerTwiddles<F>,
+) -> Result<(), FFTError> {
+    #[cfg(feature = "parallel")]
+    {
+        if buffer.len() >= PARALLEL_FFT_THRESHOLD {
+            return bowers_fft_opt_fused_parallel(buffer, twiddles);
+        }
+    }
+    bowers_fft_opt_fused(buffer, twiddles)
+}
+
+/// Dispatch inverse FFT (DIT) to parallel or sequential implementation based on buffer size.
+#[inline]
+fn dispatch_ifft<F: IsFFTField + IsSubFieldOf<E>, E: IsField + Send + Sync>(
+    buffer: &mut [FieldElement<E>],
+    twiddles: &LayerTwiddles<F>,
+) -> Result<(), FFTError> {
+    #[cfg(feature = "parallel")]
+    {
+        if buffer.len() >= PARALLEL_FFT_THRESHOLD {
+            return bowers_ifft_opt_parallel(buffer, twiddles);
+        }
+    }
+    bowers_ifft_opt(buffer, twiddles)
+}
+
+impl<E: IsField> Polynomial<FieldElement<E>> {
+    /// Returns `N` evaluations of this polynomial using FFT over a domain in a subfield F of E (so the results
+    /// are P(w^i), with w being a primitive root of unity).
+    /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`.
+    /// If `domain_size` is `None`, it defaults to 0.
+    pub fn evaluate_fft<F: IsFFTField + IsSubFieldOf<E>>(
+        poly: &Polynomial<FieldElement<E>>,
+        blowup_factor: usize,
+        domain_size: Option<usize>,
+    ) -> Result<Vec<FieldElement<E>>, FFTError>
+    where
+        E: Send + Sync,
+    {
+        let domain_size = domain_size.unwrap_or(0);
+        let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor;
+        if len.trailing_zeros() as u64 > F::TWO_ADICITY {
+            return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize));
+        }
+        if poly.coefficients().is_empty() {
+            return Ok(vec![FieldElement::zero(); len]);
+        }
+
+        let mut coeffs = poly.coefficients().to_vec();
+        coeffs.resize(len, FieldElement::zero());
+        // padding with zeros will make FFT return more evaluations of the same polynomial.
+
+        evaluate_fft_cpu::<F, E>(&coeffs)
+    }
+
+    /// Same as `evaluate_fft` but returns the evaluations in bit-reversed order,
+    /// skipping the final natural-order permutation. Use when the consumer expects
+    /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as
+    /// {f(x), f(-x)}).
+    ///
+    /// Currently exercised only by the FFT property tests; gated to keep it out
+    /// of the production surface until a non-test caller needs it.
+    #[cfg(test)]
+    pub fn evaluate_fft_bit_reversed<F: IsFFTField + IsSubFieldOf<E>>(
+        poly: &Polynomial<FieldElement<E>>,
+        blowup_factor: usize,
+        domain_size: Option<usize>,
+    ) -> Result<Vec<FieldElement<E>>, FFTError>
+    where
+        E: Send + Sync,
+    {
+        let domain_size = domain_size.unwrap_or(0);
+        let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor;
+        if len.trailing_zeros() as u64 > F::TWO_ADICITY {
+            return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize));
+        }
+        if poly.coefficients().is_empty() {
+            return Ok(vec![FieldElement::zero(); len]);
+        }
+
+        let mut coeffs = poly.coefficients().to_vec();
+        coeffs.resize(len, FieldElement::zero());
+
+        evaluate_fft_cpu_raw::<F, E>(&coeffs, false)
+    }
+
+    /// Returns `N` evaluations with an offset of this polynomial using FFT over a domain in a subfield F of E
+    /// (so the results are P(w^i), with w being a primitive root of unity).
+    /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`.
+    /// If `domain_size` is `None`, it defaults to 0.
+    pub fn evaluate_offset_fft<F: IsFFTField + IsSubFieldOf<E>>(
+        poly: &Polynomial<FieldElement<E>>,
+        blowup_factor: usize,
+        domain_size: Option<usize>,
+        offset: &FieldElement<F>,
+    ) -> Result<Vec<FieldElement<E>>, FFTError>
+    where
+        E: Send + Sync,
+    {
+        let scaled = poly.scale(offset);
+        Polynomial::evaluate_fft::<F>(&scaled, blowup_factor, domain_size)
+    }
+
+    /// Returns a new polynomial that interpolates `(w^i, fft_evals[i])`, with `w` being a
+    /// Nth primitive root of unity in a subfield F of E, and `i in 0..N`, with `N = fft_evals.len()`.
+    /// This is considered to be the inverse operation of [Self::evaluate_fft()].
+    pub fn interpolate_fft<F: IsFFTField + IsSubFieldOf<E>>(
+        fft_evals: &[FieldElement<E>],
+    ) -> Result<Self, FFTError>
+    where
+        E: Send + Sync,
+    {
+        interpolate_fft_cpu::<F, E>(fft_evals)
+    }
+
+    /// Returns a new polynomial that interpolates offset `(w^i, fft_evals[i])`, with `w` being a
+    /// Nth primitive root of unity in a subfield F of E, and `i in 0..N`, with `N = fft_evals.len()`.
+    /// This is considered to be the inverse operation of [Self::evaluate_offset_fft()].
+    pub fn interpolate_offset_fft<F: IsFFTField + IsSubFieldOf<E>>(
+        fft_evals: &[FieldElement<E>],
+        offset: &FieldElement<F>,
+    ) -> Result<Polynomial<FieldElement<E>>, FFTError>
+    where
+        E: Send + Sync,
+    {
+        let scaled = Polynomial::interpolate_fft::<F>(fft_evals)?;
+        Ok(scaled.scale(&offset.inv().unwrap()))
+    }
+
+    /// Compute the coset LDE with pre-computed twiddle factors and pre-computed weights.
+    ///
+    /// Same as [`coset_lde_with_twiddles`], but also accepts pre-computed `weights[i] = offset^i / n`
+    /// so that the scaling step avoids the running product across columns.
+    /// Weights are in the base field F — the scaling `w * coeff` uses mixed F×E multiplication.
+    pub fn coset_lde_full<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
+        evals: &[FieldElement<E>],
+        blowup_factor: usize,
+        weights: &[FieldElement<F>],
+        inv_twiddles: &LayerTwiddles<F>,
+        fwd_twiddles: &LayerTwiddles<F>,
+    ) -> Result<Vec<FieldElement<E>>, FFTError>
+    where
+        E: Send + Sync,
+    {
+        let n = evals.len();
+        if n == 0 {
+            return Ok(Vec::new());
+        }
+        let lde_size = n * blowup_factor;
+        let mut buffer = Vec::with_capacity(lde_size);
+        Self::coset_lde_full_into(
+            evals,
+            blowup_factor,
+            weights,
+            inv_twiddles,
+            fwd_twiddles,
+            &mut buffer,
+        )?;
+        Ok(buffer)
+    }
+
+    /// Compute the coset LDE into a caller-provided buffer, avoiding allocation when
+    /// `buffer.capacity() >= n * blowup_factor`.
+    ///
+    /// Same as [`coset_lde_full`], but writes into `buffer` instead of allocating a new Vec.
+    /// The buffer is cleared and reused: `buffer.clear(); buffer.extend_from_slice(evals);
+    /// buffer.resize(lde_size, zero)`. When the capacity is sufficient, no heap allocation occurs.
+    /// Weights are in the base field F — the scaling `w * coeff` uses mixed F×E multiplication.
+    pub fn coset_lde_full_into<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
+        evals: &[FieldElement<E>],
+        blowup_factor: usize,
+        weights: &[FieldElement<F>],
+        inv_twiddles: &LayerTwiddles<F>,
+        fwd_twiddles: &LayerTwiddles<F>,
+        buffer: &mut Vec<FieldElement<E>>,
+    ) -> Result<(), FFTError>
+    where
+        E: Send + Sync,
+    {
+        let n = evals.len();
+        if n == 0 {
+            buffer.clear();
+            return Ok(());
+        }
+        if !n.is_power_of_two() {
+            return Err(FFTError::InputError(n));
+        }
+        let lde_size = n * blowup_factor;
+
+        if (lde_size.trailing_zeros() as u64) > F::TWO_ADICITY {
+            return Err(FFTError::DomainSizeError(lde_size.trailing_zeros() as usize));
+        }
+
+        buffer.clear();
+        buffer.extend_from_slice(evals);
+        buffer.resize(lde_size, FieldElement::zero());
+
+        in_place_bit_reverse_permute(&mut buffer[..n]);
+        dispatch_ifft(&mut buffer[..n], inv_twiddles)?;
+
+        // Scale using pre-computed weights (base field) — F × E → E mixed multiplication.
+        for (coeff, w) in buffer[..n].iter_mut().zip(weights.iter()) {
+            *coeff = w * &*coeff;
+        }
+
+        dispatch_fft(buffer, fwd_twiddles)?;
+        in_place_bit_reverse_permute(buffer);
+
+        Ok(())
+    }
+
+    /// In-place coset LDE: the buffer already contains N evaluation points at `[0..N]`.
+    ///
+    /// This expands the buffer from N elements to `N * blowup_factor` by performing:
+    /// 1. iFFT on buffer[..N]
+    /// 2. Scale by pre-computed weights
+    /// 3. Zero-pad to N * blowup_factor
+    /// 4. Forward FFT on the full buffer
+    ///
+    /// Unlike [`coset_lde_full_into`], this skips the `clear + extend_from_slice` step
+    /// since data is already in the buffer. Used for transpose elimination: columns are
+    /// extracted directly into owned buffers, then expanded in-place.
+    pub fn coset_lde_full_expand<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
+        buffer: &mut Vec<FieldElement<E>>,
+        blowup_factor: usize,
+        weights: &[FieldElement<F>],
+        inv_twiddles: &LayerTwiddles<F>,
+        fwd_twiddles: &LayerTwiddles<F>,
+    ) -> Result<(), FFTError>
+    where
+        E: Send + Sync,
+    {
+        let n = buffer.len();
+        if n == 0 {
+            return Ok(());
+        }
+        if !n.is_power_of_two() {
+            return Err(FFTError::InputError(n));
+        }
+        let lde_size = n * blowup_factor;
+
+        if (lde_size.trailing_zeros() as u64) > F::TWO_ADICITY {
+            return Err(FFTError::DomainSizeError(lde_size.trailing_zeros() as usize));
+        }
+
+        // 1. iFFT on buffer[..n]
+        in_place_bit_reverse_permute(&mut buffer[..n]);
+        dispatch_ifft(&mut buffer[..n], inv_twiddles)?;
+
+        // 2. Scale using pre-computed weights (base field) — F × E → E mixed multiplication.
+        for (coeff, w) in buffer[..n].iter_mut().zip(weights.iter()) {
+            *coeff = w * &*coeff;
+        }
+
+        // 3. Zero-pad to lde_size
+        buffer.resize(lde_size, FieldElement::zero());
+
+        // 4. Forward FFT on the full buffer
+        dispatch_fft(buffer, fwd_twiddles)?;
+        in_place_bit_reverse_permute(buffer);
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+pub fn compose_fft<F, E>(
+    poly_1: &Polynomial<FieldElement<E>>,
+    poly_2: &Polynomial<FieldElement<E>>,
+) -> Polynomial<FieldElement<E>>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+{
+    let poly_2_evaluations = Polynomial::evaluate_fft::<F>(poly_2, 1, None).unwrap();
+
+    let values: Vec<_> = poly_2_evaluations
+        .iter()
+        .map(|value| poly_1.evaluate(value))
+        .collect();
+
+    Polynomial::interpolate_fft::<F>(values.as_slice()).unwrap()
+}
+
+pub fn evaluate_fft_cpu<F, E>(coeffs: &[FieldElement<E>]) -> Result<Vec<FieldElement<E>>, FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+{
+    evaluate_fft_cpu_raw::<F, E>(coeffs, true)
+}
+
+fn evaluate_fft_cpu_raw<F, E>(
+    coeffs: &[FieldElement<E>],
+    permute_to_natural: bool,
+) -> Result<Vec<FieldElement<E>>, FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+{
+    let n = coeffs.len();
+    if !n.is_power_of_two() {
+        return Err(FFTError::InputError(n));
+    }
+    let order = n.trailing_zeros() as u64;
+    let layer_twiddles =
+        LayerTwiddles::<F>::new(order).ok_or(FFTError::DomainSizeError(order as usize))?;
+
+    let mut result = coeffs.to_vec();
+    dispatch_fft(&mut result, &layer_twiddles)?;
+    if permute_to_natural {
+        in_place_bit_reverse_permute(&mut result);
+    }
+    Ok(result)
+}
+
+pub fn interpolate_fft_cpu<F, E>(
+    fft_evals: &[FieldElement<E>],
+) -> Result<Polynomial<FieldElement<E>>, FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+{
+    let n = fft_evals.len();
+    if !n.is_power_of_two() {
+        return Err(FFTError::InputError(n));
+    }
+    let order = n.trailing_zeros() as u64;
+    let inv_twiddles =
+        LayerTwiddles::<F>::new_inverse(order).ok_or(FFTError::DomainSizeError(order as usize))?;
+
+    let mut coeffs = fft_evals.to_vec();
+    // Bowers iFFT: bit-reverse first (natural → bit-reversed), then DIT inverse butterflies
+    in_place_bit_reverse_permute(&mut coeffs);
+    dispatch_ifft(&mut coeffs, &inv_twiddles)?;
+
+    // Scale by 1/n
+    let scale_factor = FieldElement::from(n as u64).inv().unwrap();
+    Ok(Polynomial::new(&coeffs).scale_coeffs(&scale_factor))
+}
+
+#[cfg(test)]
+impl<E: IsField> Polynomial<FieldElement<E>> {
+    /// Multiplies two polynomials using FFT.
+    pub fn fast_fft_multiplication<F: IsFFTField + IsSubFieldOf<E>>(
+        &self,
+        other: &Self,
+    ) -> Result<Self, FFTError>
+    where
+        E: Send + Sync,
+    {
+        let domain_size = self.degree() + other.degree() + 1;
+        let p = Polynomial::evaluate_fft::<F>(self, 1, Some(domain_size))?;
+        let q = Polynomial::evaluate_fft::<F>(other, 1, Some(domain_size))?;
+        let r = p.into_iter().zip(q).map(|(a, b)| a * b).collect::<Vec<_>>();
+
+        Polynomial::interpolate_fft::<F>(&r)
+    }
+
+    /// Divides two polynomials with remainder using FFT.
+    pub fn fast_division<F: IsSubFieldOf<E> + IsFFTField>(
+        &self,
+        divisor: &Self,
+    ) -> Result<(Self, Self), FFTError>
+    where
+        E: Send + Sync,
+    {
+        use crate::field::errors::FieldError;
+
+        let n = self.degree();
+        let m = divisor.degree();
+        if divisor.coefficients.is_empty()
+            || divisor
+                .coefficients
+                .iter()
+                .all(|c| c == &FieldElement::zero())
+        {
+            return Err(FieldError::DivisionByZero.into());
+        }
+        if n < m {
+            return Ok((Self::zero(), self.clone()));
+        }
+        let d = n - m;
+        let a_rev = self.reverse(n);
+        let b_rev = divisor.reverse(m);
+        let inv_b_rev = b_rev.invert_polynomial_mod::<F>(d + 1)?;
+        let q = a_rev
+            .fast_fft_multiplication::<F>(&inv_b_rev)?
+            .truncate(d + 1)
+            .reverse(d);
+
+        let r = self - q.fast_fft_multiplication::<F>(divisor)?;
+        Ok((q, r))
+    }
+
+    /// Computes the inverse of polynomial P modulo x^k using Newton iteration.
+    pub fn invert_polynomial_mod<F: IsSubFieldOf<E> + IsFFTField>(
+        &self,
+        k: usize,
+    ) -> Result<Self, FFTError>
+    where
+        E: Send + Sync,
+    {
+        use crate::field::errors::FieldError;
+
+        if self.coefficients.is_empty()
+            || self.coefficients.iter().all(|c| c == &FieldElement::zero())
+        {
+            return Err(FieldError::DivisionByZero.into());
+        }
+        let mut q = Self::new(&[self.coefficients[0].inv()?]);
+        let mut current_precision = 1;
+
+        let two = Self::new(&[FieldElement::<F>::one() + FieldElement::one()]);
+        while current_precision < k {
+            current_precision *= 2;
+            let temp = self
+                .fast_fft_multiplication::<F>(&q)?
+                .truncate(current_precision);
+            let correction = &two - temp;
+            q = q
+                .fast_fft_multiplication::<F>(&correction)?
+                .truncate(current_precision);
+        }
+
+        Ok(q.truncate(k))
+    }
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+    use super::*;
+    use crate::field::goldilocks::GoldilocksField;
+
+    type F = GoldilocksField;
+    type FE = FieldElement<F>;
+
+    #[test]
+    fn coset_lde_full_into_matches_coset_lde_full() {
+        use crate::fft::bowers_fft::LayerTwiddles;
+
+        let offset = FE::from(3u64);
+        let blowup_factor = 2;
+
+        for order in 1..=10 {
+            let n = 1usize << order;
+            let evals: Vec<FE> = (0..n).map(|i| FE::from((i * 7 + 13) as u64)).collect();
+
+            let lde_size = n * blowup_factor;
+            let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
+            let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
+
+            let n_inv = FE::from(n as u64).inv().unwrap();
+            let mut weights = Vec::with_capacity(n);
+            let mut offset_power = n_inv;
+            for _ in 0..n {
+                weights.push(offset_power);
+                offset_power = &offset_power * &offset;
+            }
+
+            let reference = Polynomial::<FE>::coset_lde_full::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+            )
+            .unwrap();
+
+            // Test with pre-allocated buffer
+            let mut buffer = Vec::with_capacity(lde_size);
+            Polynomial::<FE>::coset_lde_full_into::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+                &mut buffer,
+            )
+            .unwrap();
+
+            assert_eq!(reference, buffer, "Mismatch at order {}", order);
+        }
+    }
+
+    #[test]
+    fn coset_lde_full_into_reuses_buffer() {
+        use crate::fft::bowers_fft::LayerTwiddles;
+
+        let offset = FE::from(5u64);
+        let blowup_factor = 2usize;
+        let n = 16usize;
+        let lde_size = n * blowup_factor;
+
+        let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
+        let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
+
+        let n_inv = FE::from(n as u64).inv().unwrap();
+        let mut weights = Vec::with_capacity(n);
+        let mut offset_power = n_inv;
+        for _ in 0..n {
+            weights.push(offset_power);
+            offset_power = &offset_power * &offset;
+        }
+
+        // Pre-allocate buffer once, reuse for two different inputs
+        let mut buffer = Vec::with_capacity(lde_size);
+
+        for seed in [13u64, 42u64] {
+            let evals: Vec<FE> = (0..n).map(|i| FE::from(i as u64 * seed + 1)).collect();
+
+            let reference = Polynomial::<FE>::coset_lde_full::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+            )
+            .unwrap();
+
+            Polynomial::<FE>::coset_lde_full_into::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+                &mut buffer,
+            )
+            .unwrap();
+
+            assert_eq!(reference, buffer, "Mismatch for seed {}", seed);
+        }
+    }
+}
diff --git a/crypto/math/src/polynomial/README.md b/crypto/math/src/polynomial/README.md
deleted file mode 100644
index ad8f22681..000000000
--- a/crypto/math/src/polynomial/README.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# lambdaworks Polynomials
-
-Contains all the relevant tools for polynomials. Supports:
-- [Univariate polynomials](./mod.rs)
-- [Dense Multivariate polynomials](../polynomial/dense_multilinear_poly.rs) and [Sparse Multilinear polynomials](../polynomial/sparse_multilinear_poly.rs)
-
-lambdaworks's polynomials work over [Finite Fields](../field/README.md).
-
-## Univariate polynomials
-
-Univariate polynomials are expressions of the form $p(x) = a_0 + a_1 x + a_2 x^2 + ... + a_n x^n$, where $x$ is the indeterminate and $a_0, a_1 , ... , a_n$ take values over a finite field. The power with the highest non-zero coefficient is called the degree of the polynomial. A univariate polynomial is represented by means of the following struct:
-```rust
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Polynomial<FE> {
-    pub coefficients: Vec<FE>,
-}
-```
-it contains the coefficients in increasing order (we start with the independent term, $a_0$, then $a_1$, and so on and so forth). To create a new polynomial,
-```rust
-let my_poly = Polynomial::new(&[FE::new(1), FE::new(2), FE::new(3)])
-```
-This creates the polynomial $p(x) = 1 + 2 x + 3 x^2$. If we provide additional zeros to the right, the `new` method will remove those unnecessary zeros. For example,
-```rust
-let my_poly = Polynomial::new(&[FE::new(1), FE::new(2), FE::new(3), FE::ZERO])
-```
-generates the same polynomial as before. We can also create a monomial, such as $5 x^4$ or $27 x^{120}$, which can be simpler sometimes (instead of providing a long list of zeros). To define a monomial, simply
-```rust
-let my_monomial = Polynomial::new_monomial(FE::new(27),6)
-```
-generates the monomial $p(x) = 27 x^6$, which has a representation as polynomial $(0,0,0,0,0,0,27)$.
-
-Univariate polynomials have a [ring structure](https://en.wikipedia.org/wiki/Ring_(mathematics)): we can add, subtract, multiply and divide as we did with integers. For example, to add two polynomials,
-```rust
-let p_1 = Polynomial::new(&[FE::new(3), FE::new(4), FE::new(5)])
-let p_2 = Polynomial::new(&[FE::new(4), FE::new(6), FE::new(8)])
-let p_a = p_1 + p_2
-```
-Polynomial multiplication,
-```rust
-let p1 = Polynomial::new(&[FE::new(3), FE::new(3), FE::new(2)]);
-let p2 = Polynomial::new(&[FE::new(4), FE::new(1)]);
-assert_eq!(
-    p2 * p1,
-    Polynomial::new(&[FE::new(12), FE::new(15), FE::new(11), FE::new(2)])
-    );
-```
-Division,
-```rust
-let p1 = Polynomial::new(&[FE::new(1), FE::new(3)]);
-let p2 = Polynomial::new(&[FE::new(1), FE::new(3)]);
-let p3 = p1.mul_with_ref(&p2);
-assert_eq!(p3 / p2, p1);
-```
-Note that, in the case of polynomial division, it may have a remainder. If you want to divide a polynomial $p(x)$ by $x - b$, you can use faster alternatives, such as `ruffini_division` or `ruffini_division_inplace`.
-
-Polynomials can also be evaluated at a point $x_0$ using `evaluate`. This provides the evaluation $p( x_0 ) = a_0 + a_1 x_0 + a_2 x_0^2 + ... + a_n x_0^n$. For example,
-```rust
-let p = Polynomial::new(&[FE::new(3), -FE::new(2), FE::new(4)]);
-assert_eq!(p.evaluate(&FE::new(2)), FE::new(15));
-```
-evaluates the polynomial $p(x) = 3 - 2 x + 4 x^2$ at $2$ to yield $15$. If you need to evaluate at several points, you can use `evaluate_slice`.
-
-Alternatively, polynomials of degree $n$ can be defined by providing exactly $n + 1$ evaluations. For example, $p(1) = 1$ and $p(0) = 2$ defines a unique polynomial of degree $1$, $p(x) = 2 - x$. To obtain the coefficients of $p(x)$ we need to use the function `interpolate`, which takes to vectors, of equal length: the first contains the $x$ coordinates $(0,1)$ and the second, the $y$ components $(2,1)$ (note that we have to provide the evaluation points in the same order as their corresponding evaluations):
-```rust
-let p = Polynomial::interpolate(&[FE::new(0), FE::new(1)], &[FE::new(2), FE::new(1)]).unwrap();
-```
-
-Many polynomial operations can go faster by using the [Fast Fourier Transform](../fft/polynomial.rs).
-
-## Multilinear polynomials
-
-Multilinear polynomials are useful to define multilinear extensions of functions, which then play an important role in proof systems involving the [sumcheck protocol](../../../provers/sumcheck/README.md). There are two ways to define multilinear polynomials:
-- Dense
-- Sparse
-
-Sparse is more convenient whenever the number of non-zero coefficients in the polynomial is small (compared to the length of the polynomial), avoiding the storage of unnecessary zeros. For dense multilinear polynomials we have the following structure, working over some field $F$:
-```rust
-pub struct DenseMultilinearPolynomial<F: IsField>
-where
-    <F as IsField>::BaseType: Send + Sync,
-{
-    evals: Vec<FieldElement<F>>,
-    n_vars: usize,
-    len: usize,
-}
-```
-The polynomial is assumed to be given in evaluation form over the binary strings of length $\{0 , 1 \}^{n_{vars}}$. We can also interpret this as the coefficients of the polynomial with respect to the Lagrange basis polynomials over $\{0 , 1 \}^{n_{vars}}$. There are $2^{n_{vars}}$ Lagrange polynomials, given by the formula:
-$L_k (x_0 , x_1 , ... , x_{n_{vars} - 1}) = \prod (x_j b_{kj} + (1 - x_j ) (1 - b_{kj} ))$
-where $b_{kj}$ are given by the binary decomposition of $k$, that is $k = \sum_j b_{kj} 2^j$. We can see that each such polynomial is equal to one over $\{b_{k0}, b_{k1} , ... b_{k (n_{vars} - 1)}}$ and zero for any other element in $\{0 , 1 \}^{n_{vars}}$. The polynomial is thus defined as
-$p (x_0 , x_1, ... , x_{n_{vars} - 1} ) = \sum_k p(b_{k0}, b_{k1} , ... , b_{k (n_{vars} - 1)}) L_k (x_0 , x_1, ... , x_{n_{vars} - 1} )$
-Sometimes, we will use $L_k (j)$ to refer to the evaluation of $L_k$ at the binary decomposition of $j$, that is $j = \sum_k b_{k}2^k$.
-
-An advantage of Lagrange basis polynomials is that we can evaluate all $2^{n_{vars}}$ polynomials at a point $(r_0 , r_1 ... , r_{n_{vars} - 1})$ in $\mathcal{O}(2^{n_{vars}})$ operations (linear in the size of the number of polynomials). Refer to [Thaler's book](https://people.cs.georgetown.edu/jthaler/ProofsArgsAndZK.pdf) for more information.
-
-To create a new polynomial, provide a list of evaluations of $p$; the length of this list should be a power of 2.
-```rust
-pub fn new(mut evals: Vec<FieldElement<F>>) -> Self {
-    while !evals.len().is_power_of_two() {
-        evals.push(FieldElement::zero());
-    }
-    let len = evals.len();
-    DenseMultilinearPolynomial {
-        n_vars: log_2(len),
-        evals,
-        len,
-    }
-}
-```
-
-Dense multilinear polynomials allow you to access the fields `n_vars`, `len` and `evals` with the methods `pub fn num_vars(&self) -> usize`, `pub fn len(&self) -> usize` and `pub fn evals(&self) -> &Vec<FieldElement<F>>`.
-
-If you want to evaluate outside $\{0 , 1 \}^{n_{vars}}$, you can use the functions `pub fn evaluate(&self, r: Vec<FieldElement<F>>)` and `pub fn evaluate_with(evals: &[FieldElement<F>], r: &[FieldElement<F>])`, providing the point $r$ whose length must be $n_{vars}$. For evaluations over $\{0 , 1 \}^{n_{vars}}$, you can get the value directly from the list of evaluations defining the polynomial. For example, 
-```rust
-// Example: Z = [1, 2, 1, 4]
-let z = vec![FE::one(), FE::from(2u64), FE::one(), FE::from(4u64)];
-// r = [4, 3]
-let r = vec![FE::from(4u64), FE::from(3u64)];
-let eval_with_lr = evaluate_with_lr(&z, &r);
-let poly = DenseMultilinearPolynomial::new(z);
-let eval = poly.evaluate(r).unwrap();
-assert_eq!(eval, FE::from(28u64));
-```
-
-An important functionality is `pub fn to_univariate(&self) -> Polynomial<FieldElement<F>>`, which converts a multilinear polynomial into a univariate polynomial, by summing over all variables over $\{0 , 1 \}^{n_{vars} - 1}$, leaving $x_{n_{vars} - 1}$ as the only variable, 
-$$f(x) = \sum_{(x_0 , x_1, ... , x_{n_{vars} - 2} ) \in \{0 , 1 \}^{n_{vars} - 1}} p(x_0 , x_1, ... , x_{n_{vars} - 2} , x)$$. For example,
-```rust
-let univar0 = prover.poly.to_univariate();
-```
-is used in the sumcheck protocol.
-
-Multilinear polynomials can be added and multiplied by scalars.
\ No newline at end of file
diff --git a/crypto/math/src/tests/fft_tests.rs b/crypto/math/src/tests/fft_tests.rs
index 2427d64ae..1319947a7 100644
--- a/crypto/math/src/tests/fft_tests.rs
+++ b/crypto/math/src/tests/fft_tests.rs
@@ -52,7 +52,7 @@ mod fft_polynomial_tests {
     use crate::fft::roots_of_unity::{
         get_powers_of_primitive_root, get_powers_of_primitive_root_coset,
     };
-    use crate::fft::polynomial::compose_fft;
+    use crate::polynomial::compose_fft;
     use crate::field::element::FieldElement;
     use crate::field::extensions_goldilocks::Degree2GoldilocksExtensionField;
     use crate::field::traits::{IsFFTField, RootsConfig};

From 08e50696777b8602a8c653e58da865580cd43201 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Wed, 20 May 2026 10:36:53 -0300
Subject: [PATCH 07/10] refactor(math): consolidate scattered tests into tests/
 directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three test conventions coexisted: the `tests/` directory, a sibling
`bowers_fft_tests.rs` next to source, and an inline `#[cfg(test)] mod`
in the polynomial module. Unify on the `tests/` directory.

- Move `fft/bowers_fft_tests.rs` -> `tests/bowers_fft_tests.rs`, register
  it in `tests/mod.rs`, drop the `mod bowers_fft_tests;` from `fft/mod.rs`.
- Move the inline coset-LDE `mod tests` out of `polynomial.rs` into a new
  `coset_lde_tests` module in `tests/fft_tests.rs`.
- Rename `fft/fft.rs` -> `fft/reference_fft.rs`: after the Step 6 flatten
  it collided with its own parent module name (`fft::fft`), which clippy
  rejects. The new name also says what it is — the reference radix-2 FFT
  used only to cross-check the production Bowers FFT. Gate the module
  `#[cfg(test)] pub(crate)` since its whole content is test-only.

216 math tests pass (same count — no tests lost in the move).
---
 crypto/math/src/fft/mod.rs                    |   6 +-
 .../math/src/fft/{fft.rs => reference_fft.rs} |   0
 crypto/math/src/polynomial.rs                 | 106 -----------------
 .../src/{fft => tests}/bowers_fft_tests.rs    |   4 +-
 crypto/math/src/tests/fft_tests.rs            | 107 +++++++++++++++++-
 crypto/math/src/tests/mod.rs                  |   1 +
 6 files changed, 112 insertions(+), 112 deletions(-)
 rename crypto/math/src/fft/{fft.rs => reference_fft.rs} (100%)
 rename crypto/math/src/{fft => tests}/bowers_fft_tests.rs (99%)

diff --git a/crypto/math/src/fft/mod.rs b/crypto/math/src/fft/mod.rs
index ee8b6bd73..70adcf6ed 100644
--- a/crypto/math/src/fft/mod.rs
+++ b/crypto/math/src/fft/mod.rs
@@ -2,12 +2,12 @@ pub mod bit_reversing;
 #[cfg(feature = "alloc")]
 pub mod bowers_fft;
 pub mod errors;
-pub mod fft;
 #[cfg(feature = "alloc")]
 pub mod roots_of_unity;
 
-#[cfg(all(test, feature = "alloc"))]
-mod bowers_fft_tests;
+/// Reference radix-2 FFT, used only to cross-check the production Bowers FFT.
+#[cfg(test)]
+pub(crate) mod reference_fft;
 
 #[cfg(all(test, feature = "alloc"))]
 pub(crate) mod test_helpers;
diff --git a/crypto/math/src/fft/fft.rs b/crypto/math/src/fft/reference_fft.rs
similarity index 100%
rename from crypto/math/src/fft/fft.rs
rename to crypto/math/src/fft/reference_fft.rs
diff --git a/crypto/math/src/polynomial.rs b/crypto/math/src/polynomial.rs
index 2600106a2..b6fb5f15a 100644
--- a/crypto/math/src/polynomial.rs
+++ b/crypto/math/src/polynomial.rs
@@ -1333,109 +1333,3 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
         Ok(q.truncate(k))
     }
 }
-
-#[cfg(all(test, feature = "alloc"))]
-mod tests {
-    use super::*;
-    use crate::field::goldilocks::GoldilocksField;
-
-    type F = GoldilocksField;
-    type FE = FieldElement<F>;
-
-    #[test]
-    fn coset_lde_full_into_matches_coset_lde_full() {
-        use crate::fft::bowers_fft::LayerTwiddles;
-
-        let offset = FE::from(3u64);
-        let blowup_factor = 2;
-
-        for order in 1..=10 {
-            let n = 1usize << order;
-            let evals: Vec<FE> = (0..n).map(|i| FE::from((i * 7 + 13) as u64)).collect();
-
-            let lde_size = n * blowup_factor;
-            let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
-            let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
-
-            let n_inv = FE::from(n as u64).inv().unwrap();
-            let mut weights = Vec::with_capacity(n);
-            let mut offset_power = n_inv;
-            for _ in 0..n {
-                weights.push(offset_power);
-                offset_power = &offset_power * &offset;
-            }
-
-            let reference = Polynomial::<FE>::coset_lde_full::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-            )
-            .unwrap();
-
-            // Test with pre-allocated buffer
-            let mut buffer = Vec::with_capacity(lde_size);
-            Polynomial::<FE>::coset_lde_full_into::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-                &mut buffer,
-            )
-            .unwrap();
-
-            assert_eq!(reference, buffer, "Mismatch at order {}", order);
-        }
-    }
-
-    #[test]
-    fn coset_lde_full_into_reuses_buffer() {
-        use crate::fft::bowers_fft::LayerTwiddles;
-
-        let offset = FE::from(5u64);
-        let blowup_factor = 2usize;
-        let n = 16usize;
-        let lde_size = n * blowup_factor;
-
-        let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
-        let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
-
-        let n_inv = FE::from(n as u64).inv().unwrap();
-        let mut weights = Vec::with_capacity(n);
-        let mut offset_power = n_inv;
-        for _ in 0..n {
-            weights.push(offset_power);
-            offset_power = &offset_power * &offset;
-        }
-
-        // Pre-allocate buffer once, reuse for two different inputs
-        let mut buffer = Vec::with_capacity(lde_size);
-
-        for seed in [13u64, 42u64] {
-            let evals: Vec<FE> = (0..n).map(|i| FE::from(i as u64 * seed + 1)).collect();
-
-            let reference = Polynomial::<FE>::coset_lde_full::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-            )
-            .unwrap();
-
-            Polynomial::<FE>::coset_lde_full_into::<F>(
-                &evals,
-                blowup_factor,
-                &weights,
-                &inv_tw,
-                &fwd_tw,
-                &mut buffer,
-            )
-            .unwrap();
-
-            assert_eq!(reference, buffer, "Mismatch for seed {}", seed);
-        }
-    }
-}
diff --git a/crypto/math/src/fft/bowers_fft_tests.rs b/crypto/math/src/tests/bowers_fft_tests.rs
similarity index 99%
rename from crypto/math/src/fft/bowers_fft_tests.rs
rename to crypto/math/src/tests/bowers_fft_tests.rs
index 396f85f9b..70a7107b8 100644
--- a/crypto/math/src/fft/bowers_fft_tests.rs
+++ b/crypto/math/src/tests/bowers_fft_tests.rs
@@ -2,9 +2,9 @@
 //!
 //! Separated from the main implementation to keep the library code concise.
 
-use super::bowers_fft::*;
 use crate::fft::bit_reversing::in_place_bit_reverse_permute;
-use crate::fft::fft::in_place_nr_2radix_fft;
+use crate::fft::bowers_fft::*;
+use crate::fft::reference_fft::in_place_nr_2radix_fft;
 use crate::fft::roots_of_unity::get_powers_of_primitive_root;
 use crate::field::element::FieldElement;
 use crate::field::goldilocks::GoldilocksField;
diff --git a/crypto/math/src/tests/fft_tests.rs b/crypto/math/src/tests/fft_tests.rs
index 1319947a7..61fc48c76 100644
--- a/crypto/math/src/tests/fft_tests.rs
+++ b/crypto/math/src/tests/fft_tests.rs
@@ -52,11 +52,11 @@ mod fft_polynomial_tests {
     use crate::fft::roots_of_unity::{
         get_powers_of_primitive_root, get_powers_of_primitive_root_coset,
     };
-    use crate::polynomial::compose_fft;
     use crate::field::element::FieldElement;
     use crate::field::extensions_goldilocks::Degree2GoldilocksExtensionField;
     use crate::field::traits::{IsFFTField, RootsConfig};
     use crate::polynomial::Polynomial;
+    use crate::polynomial::compose_fft;
     use proptest::{collection, prelude::*};
 
     /// Evaluates a polynomial at a slice of points
@@ -332,3 +332,108 @@ mod roots_of_unity_tests {
         assert!(result.is_err());
     }
 }
+
+#[cfg(all(test, feature = "alloc"))]
+mod coset_lde_tests {
+    use crate::fft::bowers_fft::LayerTwiddles;
+    use crate::field::element::FieldElement;
+    use crate::field::goldilocks::GoldilocksField;
+    use crate::polynomial::Polynomial;
+    use alloc::vec::Vec;
+
+    type F = GoldilocksField;
+    type FE = FieldElement<F>;
+
+    #[test]
+    fn coset_lde_full_into_matches_coset_lde_full() {
+        let offset = FE::from(3u64);
+        let blowup_factor = 2;
+
+        for order in 1..=10 {
+            let n = 1usize << order;
+            let evals: Vec<FE> = (0..n).map(|i| FE::from((i * 7 + 13) as u64)).collect();
+
+            let lde_size = n * blowup_factor;
+            let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
+            let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
+
+            let n_inv = FE::from(n as u64).inv().unwrap();
+            let mut weights = Vec::with_capacity(n);
+            let mut offset_power = n_inv;
+            for _ in 0..n {
+                weights.push(offset_power);
+                offset_power = &offset_power * &offset;
+            }
+
+            let reference = Polynomial::<FE>::coset_lde_full::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+            )
+            .unwrap();
+
+            // Test with pre-allocated buffer
+            let mut buffer = Vec::with_capacity(lde_size);
+            Polynomial::<FE>::coset_lde_full_into::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+                &mut buffer,
+            )
+            .unwrap();
+
+            assert_eq!(reference, buffer, "Mismatch at order {}", order);
+        }
+    }
+
+    #[test]
+    fn coset_lde_full_into_reuses_buffer() {
+        let offset = FE::from(5u64);
+        let blowup_factor = 2usize;
+        let n = 16usize;
+        let lde_size = n * blowup_factor;
+
+        let inv_tw = LayerTwiddles::<F>::new_inverse(n.trailing_zeros() as u64).unwrap();
+        let fwd_tw = LayerTwiddles::<F>::new(lde_size.trailing_zeros() as u64).unwrap();
+
+        let n_inv = FE::from(n as u64).inv().unwrap();
+        let mut weights = Vec::with_capacity(n);
+        let mut offset_power = n_inv;
+        for _ in 0..n {
+            weights.push(offset_power);
+            offset_power = &offset_power * &offset;
+        }
+
+        // Pre-allocate buffer once, reuse for two different inputs
+        let mut buffer = Vec::with_capacity(lde_size);
+
+        for seed in [13u64, 42u64] {
+            let evals: Vec<FE> = (0..n).map(|i| FE::from(i as u64 * seed + 1)).collect();
+
+            let reference = Polynomial::<FE>::coset_lde_full::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+            )
+            .unwrap();
+
+            Polynomial::<FE>::coset_lde_full_into::<F>(
+                &evals,
+                blowup_factor,
+                &weights,
+                &inv_tw,
+                &fwd_tw,
+                &mut buffer,
+            )
+            .unwrap();
+
+            assert_eq!(reference, buffer, "Mismatch for seed {}", seed);
+        }
+    }
+}
diff --git a/crypto/math/src/tests/mod.rs b/crypto/math/src/tests/mod.rs
index e44aeb56c..08403ac00 100644
--- a/crypto/math/src/tests/mod.rs
+++ b/crypto/math/src/tests/mod.rs
@@ -1,4 +1,5 @@
 pub mod barycentric_tests;
+pub mod bowers_fft_tests;
 pub mod fft_friendly_extensions_goldilocks_tests;
 pub mod fft_friendly_u64_goldilocks_tests;
 pub mod fft_tests;

From 48762cf3438b2b590118f2b54863a95865d36838 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 22 May 2026 10:14:24 -0300
Subject: [PATCH 08/10] refactor(math): drop redundant FFT cross-check code,
 relocate inline tests

The reference radix-2 FFT existed only to cross-check the production Bowers
FFT. Bowers is already verified directly against a naive O(n^2) DFT at the
same sizes, so reference_fft.rs and its four comparison tests are dead
weight. gate get_powers_of_primitive_root as test-only (production twiddles
go through LayerTwiddles), collapse the evaluate_fft_cpu / interpolate_fft_cpu
wrappers into their sole callers, and narrow coset_lde_full_into to
pub(crate).

Move the inline test modules from bit_reversing.rs, goldilocks.rs and
extensions_goldilocks.rs into the dedicated tests/ directory.
---
 crypto/math/src/fft/bit_reversing.rs          |  33 ----
 crypto/math/src/fft/mod.rs                    |   4 -
 crypto/math/src/fft/reference_fft.rs          | 156 ------------------
 crypto/math/src/fft/roots_of_unity.rs         |  13 +-
 .../math/src/field/extensions_goldilocks.rs   |  26 +--
 crypto/math/src/field/goldilocks.rs           |  29 ----
 crypto/math/src/polynomial.rs                 |  56 +++----
 crypto/math/src/tests/bit_reversing_tests.rs  |  31 ++++
 crypto/math/src/tests/bowers_fft_tests.rs     |  72 +-------
 .../src/tests/extensions_goldilocks_tests.rs  |  23 +++
 crypto/math/src/tests/goldilocks_tests.rs     |  28 ++++
 crypto/math/src/tests/mod.rs                  |   3 +
 12 files changed, 115 insertions(+), 359 deletions(-)
 delete mode 100644 crypto/math/src/fft/reference_fft.rs
 create mode 100644 crypto/math/src/tests/bit_reversing_tests.rs
 create mode 100644 crypto/math/src/tests/extensions_goldilocks_tests.rs
 create mode 100644 crypto/math/src/tests/goldilocks_tests.rs

diff --git a/crypto/math/src/fft/bit_reversing.rs b/crypto/math/src/fft/bit_reversing.rs
index f225dd5e0..b83e87212 100644
--- a/crypto/math/src/fft/bit_reversing.rs
+++ b/crypto/math/src/fft/bit_reversing.rs
@@ -16,36 +16,3 @@ pub fn reverse_index(i: usize, size: u64) -> usize {
         i.reverse_bits() >> (usize::BITS - size.trailing_zeros())
     }
 }
-
-#[cfg(all(test, feature = "alloc"))]
-mod test {
-    use super::*;
-    use alloc::vec::Vec;
-
-    // TODO: proptest would be better.
-    #[test]
-    fn bit_reverse_permutation_works() {
-        let mut reversed: Vec<usize> = Vec::with_capacity(16);
-        for i in 0..reversed.capacity() {
-            reversed.push(reverse_index(i, reversed.capacity() as u64));
-        }
-        assert_eq!(
-            reversed[..],
-            [0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15]
-        );
-
-        in_place_bit_reverse_permute(&mut reversed[..]);
-        assert_eq!(
-            reversed[..],
-            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
-        );
-    }
-
-    #[test]
-    fn bit_reverse_permutation_edge_case() {
-        let mut edge_case = [0];
-
-        in_place_bit_reverse_permute(&mut edge_case[..]);
-        assert_eq!(edge_case[..], [0]);
-    }
-}
diff --git a/crypto/math/src/fft/mod.rs b/crypto/math/src/fft/mod.rs
index 70adcf6ed..fd0d1c4e2 100644
--- a/crypto/math/src/fft/mod.rs
+++ b/crypto/math/src/fft/mod.rs
@@ -5,9 +5,5 @@ pub mod errors;
 #[cfg(feature = "alloc")]
 pub mod roots_of_unity;
 
-/// Reference radix-2 FFT, used only to cross-check the production Bowers FFT.
-#[cfg(test)]
-pub(crate) mod reference_fft;
-
 #[cfg(all(test, feature = "alloc"))]
 pub(crate) mod test_helpers;
diff --git a/crypto/math/src/fft/reference_fft.rs b/crypto/math/src/fft/reference_fft.rs
deleted file mode 100644
index 2fca71f11..000000000
--- a/crypto/math/src/fft/reference_fft.rs
+++ /dev/null
@@ -1,156 +0,0 @@
-#[cfg(test)]
-use crate::field::{
-    element::FieldElement,
-    traits::{IsFFTField, IsField, IsSubFieldOf},
-};
-
-/// In-Place Radix-2 NR DIT FFT algorithm over a slice of two-adic field elements.
-/// It's required that the twiddle factors are in bit-reverse order. Else this function will not
-/// return fourier transformed values.
-/// Also the input size needs to be a power of two.
-/// It's recommended to use the current safe abstractions instead of this function.
-///
-/// Performs a fast fourier transform with the next attributes:
-/// - In-Place: an auxiliary vector of data isn't needed for the algorithm.
-/// - Radix-2: the algorithm halves the problem size log(n) times.
-/// - NR: natural to reverse order, meaning that the input is naturally ordered and the output will
-///   be bit-reversed ordered.
-/// - DIT: decimation in time
-///
-/// It supports values in a field E and domain in a subfield F.
-#[cfg(test)]
-pub fn in_place_nr_2radix_fft<F, E>(input: &mut [FieldElement<E>], twiddles: &[FieldElement<F>])
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField,
-{
-    // divide input in groups, starting with 1, duplicating the number of groups in each stage.
-    let mut group_count = 1;
-    let mut group_size = input.len();
-
-    // for each group, there'll be group_size / 2 butterflies.
-    // a butterfly is the atomic operation of a FFT, e.g: (a, b) = (a + wb, a - wb).
-    // The 0.5 factor is what gives FFT its performance, it recursively halves the problem size
-    // (group size).
-
-    while group_count < input.len() {
-        #[allow(clippy::needless_range_loop)] // the suggestion would obfuscate a bit the algorithm
-        for group in 0..group_count {
-            let first_in_group = group * group_size;
-            let first_in_next_group = first_in_group + group_size / 2;
-
-            let w = &twiddles[group]; // a twiddle factor is used per group
-
-            for i in first_in_group..first_in_next_group {
-                let wi = w * &input[i + group_size / 2];
-
-                let y0 = &input[i] + &wi;
-                let y1 = &input[i] - &wi;
-
-                input[i] = y0;
-                input[i + group_size / 2] = y1;
-            }
-        }
-        group_count *= 2;
-        group_size /= 2;
-    }
-}
-
-/// In-Place Radix-2 RN DIT FFT algorithm. Only used in tests.
-#[cfg(test)]
-pub fn in_place_rn_2radix_fft<F>(input: &mut [FieldElement<F>], twiddles: &[FieldElement<F>])
-where
-    F: IsFFTField,
-{
-    // divide input in groups, starting with 1, duplicating the number of groups in each stage.
-    let mut group_count = 1;
-    let mut group_size = input.len();
-
-    // for each group, there'll be group_size / 2 butterflies.
-    // a butterfly is the atomic operation of a FFT, e.g: (a, b) = (a + wb, a - wb).
-    // The 0.5 factor is what gives FFT its performance, it recursively halves the problem size
-    // (group size).
-
-    while group_count < input.len() {
-        let step_to_next = 2 * group_count; // next butterfly in the group
-        let step_to_last = step_to_next * (group_size / 2 - 1);
-
-        for group in 0..group_count {
-            let w = &twiddles[group * group_size / 2];
-
-            for i in (group..=group + step_to_last).step_by(step_to_next) {
-                let wi = w * &input[i + group_count];
-
-                let y0 = &input[i] + &wi;
-                let y1 = &input[i] - &wi;
-
-                input[i] = y0;
-                input[i + group_count] = y1;
-            }
-        }
-        group_count *= 2;
-        group_size /= 2;
-    }
-}
-
-#[cfg(all(test, feature = "alloc"))]
-mod tests {
-    use crate::fft::bit_reversing::in_place_bit_reverse_permute;
-    use crate::fft::roots_of_unity::get_powers_of_primitive_root;
-    use crate::fft::test_helpers::naive_matrix_dft_test;
-    use crate::field::{test_fields::u64_test_field::U64TestField, traits::RootsConfig};
-    use proptest::{collection, prelude::*};
-
-    use super::*;
-
-    type F = U64TestField;
-    type FE = FieldElement<F>;
-
-    prop_compose! {
-        fn powers_of_two(max_exp: u8)(exp in 1..max_exp) -> usize { 1 << exp }
-        // max_exp cannot be multiple of the bits that represent a usize, generally 64 or 32.
-        // also it can't exceed the test field's two-adicity.
-    }
-    prop_compose! {
-        fn field_element()(num in any::<u64>().prop_filter("Avoid null coefficients", |x| x != &0)) -> FE {
-            FE::from(num)
-        }
-    }
-    prop_compose! {
-        fn field_vec(max_exp: u8)(vec in (1..max_exp).prop_flat_map(|i| collection::vec(field_element(), 1 << i))) -> alloc::vec::Vec<FE> {
-            vec
-        }
-    }
-    proptest! {
-        // Property-based test that ensures NR Radix-2 FFT gives the same result as a naive DFT.
-        #[test]
-        fn test_nr_2radix_fft_matches_naive_eval(coeffs in field_vec(8)) {
-            let expected = naive_matrix_dft_test(&coeffs);
-
-            let order = coeffs.len().trailing_zeros();
-            let twiddles = get_powers_of_primitive_root::<F>(order.into(), (1 << order) / 2, RootsConfig::BitReverse).unwrap();
-
-            let mut result = coeffs;
-            in_place_nr_2radix_fft::<F, F>(&mut result, &twiddles);
-            in_place_bit_reverse_permute(&mut result);
-
-            prop_assert_eq!(expected, result);
-        }
-
-        // Property-based test that ensures RN Radix-2 FFT gives the same result as a naive DFT.
-        #[test]
-        fn test_rn_2radix_fft_matches_naive_eval(coeffs in field_vec(8)) {
-            let expected = naive_matrix_dft_test(&coeffs);
-
-            let order = coeffs.len().trailing_zeros();
-            let twiddles = get_powers_of_primitive_root::<F>(order.into(), (1 << order) / 2, RootsConfig::Natural).unwrap();
-
-            let mut result = coeffs;
-            in_place_bit_reverse_permute(&mut result);
-            in_place_rn_2radix_fft(&mut result, &twiddles);
-
-            prop_assert_eq!(result, expected);
-        }
-
-    }
-}
diff --git a/crypto/math/src/fft/roots_of_unity.rs b/crypto/math/src/fft/roots_of_unity.rs
index e1f9d9b93..e3c0189cf 100644
--- a/crypto/math/src/fft/roots_of_unity.rs
+++ b/crypto/math/src/fft/roots_of_unity.rs
@@ -1,15 +1,20 @@
-use crate::field::{
-    element::FieldElement,
-    traits::{IsFFTField, RootsConfig},
-};
+use crate::field::{element::FieldElement, traits::IsFFTField};
 use alloc::vec::Vec;
 
 use crate::fft::errors::FFTError;
 
+// `RootsConfig` and the bit-reverse permutation are only used by the test-only
+// `get_powers_of_primitive_root` below.
+#[cfg(test)]
 use super::bit_reversing::in_place_bit_reverse_permute;
+#[cfg(test)]
+use crate::field::traits::RootsConfig;
 
 /// Returns a `Vec` of the powers of a `2^n`th primitive root of unity in some configuration
 /// `config`. For example, in a `Natural` config this would yield: w^0, w^1, w^2...
+///
+/// Test-only: production twiddle generation goes through `bowers_fft::LayerTwiddles`.
+#[cfg(test)]
 pub fn get_powers_of_primitive_root<F: IsFFTField>(
     n: u64,
     count: usize,
diff --git a/crypto/math/src/field/extensions_goldilocks.rs b/crypto/math/src/field/extensions_goldilocks.rs
index f12af9ce9..45fd7274b 100644
--- a/crypto/math/src/field/extensions_goldilocks.rs
+++ b/crypto/math/src/field/extensions_goldilocks.rs
@@ -95,7 +95,7 @@ impl ByteConversion for [FpE; 3] {
 // This means Fp2 = Fp[x] / (x^2 - 7)
 // Elements are represented as a0 + a1*w where w^2 = 7
 
-type FpE = FieldElement<GoldilocksField>;
+pub(crate) type FpE = FieldElement<GoldilocksField>;
 
 /// Degree 2 extension field of Goldilocks
 #[derive(Copy, Clone, Debug)]
@@ -569,27 +569,3 @@ impl HasDefaultTranscript for Degree3GoldilocksExtensionField {
 fn mul_by_7(a: &FpE) -> FpE {
     FpE::from_raw(mul_by_7_raw(*a.value()))
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::traits::ByteConversion;
-
-    #[test]
-    fn write_bytes_be_matches_as_bytes() {
-        let cases = [
-            FieldElement::<Degree3GoldilocksExtensionField>::zero(),
-            FieldElement::<Degree3GoldilocksExtensionField>::one(),
-            FieldElement::<Degree3GoldilocksExtensionField>::new([
-                FpE::from(1u64),
-                FpE::from(2u64),
-                FpE::from(3u64),
-            ]),
-        ];
-        for elem in &cases {
-            let mut buf = [0u8; 24];
-            elem.write_bytes_be(&mut buf);
-            assert_eq!(&buf[..], elem.as_bytes().as_slice());
-        }
-    }
-}
diff --git a/crypto/math/src/field/goldilocks.rs b/crypto/math/src/field/goldilocks.rs
index 9891071b3..082d57325 100644
--- a/crypto/math/src/field/goldilocks.rs
+++ b/crypto/math/src/field/goldilocks.rs
@@ -551,32 +551,3 @@ impl HasDefaultTranscript for GoldilocksField {
         }
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::traits::ByteConversion;
-
-    #[test]
-    fn write_bytes_be_matches_as_bytes() {
-        let cases = [
-            FieldElement::<GoldilocksField>::from(0u64),
-            FieldElement::<GoldilocksField>::from(1u64),
-            FieldElement::<GoldilocksField>::from(GOLDILOCKS_PRIME - 1),
-        ];
-        for elem in &cases {
-            let mut buf = [0u8; 8];
-            elem.write_bytes_be(&mut buf);
-            assert_eq!(&buf[..], elem.as_bytes().as_slice());
-        }
-    }
-
-    #[test]
-    fn write_bytes_be_matches_as_bytes_noncanonical() {
-        // Values stored as-is via from_raw are non-canonical (>= p) until serialized.
-        let elem = FieldElement::<GoldilocksField>::from_raw(GOLDILOCKS_PRIME + 5);
-        let mut buf = [0u8; 8];
-        elem.write_bytes_be(&mut buf);
-        assert_eq!(&buf[..], elem.as_bytes().as_slice());
-    }
-}
diff --git a/crypto/math/src/polynomial.rs b/crypto/math/src/polynomial.rs
index b6fb5f15a..f4351340d 100644
--- a/crypto/math/src/polynomial.rs
+++ b/crypto/math/src/polynomial.rs
@@ -957,7 +957,7 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
         coeffs.resize(len, FieldElement::zero());
         // padding with zeros will make FFT return more evaluations of the same polynomial.
 
-        evaluate_fft_cpu::<F, E>(&coeffs)
+        evaluate_fft_cpu_raw::<F, E>(&coeffs, true)
     }
 
     /// Same as `evaluate_fft` but returns the evaluations in bit-reversed order,
@@ -1017,7 +1017,22 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
     where
         E: Send + Sync,
     {
-        interpolate_fft_cpu::<F, E>(fft_evals)
+        let n = fft_evals.len();
+        if !n.is_power_of_two() {
+            return Err(FFTError::InputError(n));
+        }
+        let order = n.trailing_zeros() as u64;
+        let inv_twiddles = LayerTwiddles::<F>::new_inverse(order)
+            .ok_or(FFTError::DomainSizeError(order as usize))?;
+
+        let mut coeffs = fft_evals.to_vec();
+        // Bowers iFFT: bit-reverse first (natural -> bit-reversed), then DIT inverse butterflies
+        in_place_bit_reverse_permute(&mut coeffs);
+        dispatch_ifft(&mut coeffs, &inv_twiddles)?;
+
+        // Scale by 1/n
+        let scale_factor = FieldElement::from(n as u64).inv().unwrap();
+        Ok(Polynomial::new(&coeffs).scale_coeffs(&scale_factor))
     }
 
     /// Returns a new polynomial that interpolates offset `(w^i, fft_evals[i])`, with `w` being a
@@ -1073,7 +1088,7 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
     /// The buffer is cleared and reused: `buffer.clear(); buffer.extend_from_slice(evals);
     /// buffer.resize(lde_size, zero)`. When the capacity is sufficient, no heap allocation occurs.
     /// Weights are in the base field F — the scaling `w * coeff` uses mixed F×E multiplication.
-    pub fn coset_lde_full_into<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
+    pub(crate) fn coset_lde_full_into<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
         evals: &[FieldElement<E>],
         blowup_factor: usize,
         weights: &[FieldElement<F>],
@@ -1124,7 +1139,7 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
     /// 3. Zero-pad to N * blowup_factor
     /// 4. Forward FFT on the full buffer
     ///
-    /// Unlike [`coset_lde_full_into`], this skips the `clear + extend_from_slice` step
+    /// Unlike `coset_lde_full_into`, this skips the `clear + extend_from_slice` step
     /// since data is already in the buffer. Used for transpose elimination: columns are
     /// extracted directly into owned buffers, then expanded in-place.
     pub fn coset_lde_full_expand<F: IsFFTField + IsSubFieldOf<E> + Send + Sync>(
@@ -1189,14 +1204,6 @@ where
     Polynomial::interpolate_fft::<F>(values.as_slice()).unwrap()
 }
 
-pub fn evaluate_fft_cpu<F, E>(coeffs: &[FieldElement<E>]) -> Result<Vec<FieldElement<E>>, FFTError>
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField + Send + Sync,
-{
-    evaluate_fft_cpu_raw::<F, E>(coeffs, true)
-}
-
 fn evaluate_fft_cpu_raw<F, E>(
     coeffs: &[FieldElement<E>],
     permute_to_natural: bool,
@@ -1221,31 +1228,6 @@ where
     Ok(result)
 }
 
-pub fn interpolate_fft_cpu<F, E>(
-    fft_evals: &[FieldElement<E>],
-) -> Result<Polynomial<FieldElement<E>>, FFTError>
-where
-    F: IsFFTField + IsSubFieldOf<E>,
-    E: IsField + Send + Sync,
-{
-    let n = fft_evals.len();
-    if !n.is_power_of_two() {
-        return Err(FFTError::InputError(n));
-    }
-    let order = n.trailing_zeros() as u64;
-    let inv_twiddles =
-        LayerTwiddles::<F>::new_inverse(order).ok_or(FFTError::DomainSizeError(order as usize))?;
-
-    let mut coeffs = fft_evals.to_vec();
-    // Bowers iFFT: bit-reverse first (natural → bit-reversed), then DIT inverse butterflies
-    in_place_bit_reverse_permute(&mut coeffs);
-    dispatch_ifft(&mut coeffs, &inv_twiddles)?;
-
-    // Scale by 1/n
-    let scale_factor = FieldElement::from(n as u64).inv().unwrap();
-    Ok(Polynomial::new(&coeffs).scale_coeffs(&scale_factor))
-}
-
 #[cfg(test)]
 impl<E: IsField> Polynomial<FieldElement<E>> {
     /// Multiplies two polynomials using FFT.
diff --git a/crypto/math/src/tests/bit_reversing_tests.rs b/crypto/math/src/tests/bit_reversing_tests.rs
new file mode 100644
index 000000000..42de5eeef
--- /dev/null
+++ b/crypto/math/src/tests/bit_reversing_tests.rs
@@ -0,0 +1,31 @@
+//! Tests for the bit-reverse permutation, relocated from `fft/bit_reversing.rs`.
+
+use crate::fft::bit_reversing::{in_place_bit_reverse_permute, reverse_index};
+use alloc::vec::Vec;
+
+// TODO: proptest would be better.
+#[test]
+fn bit_reverse_permutation_works() {
+    let mut reversed: Vec<usize> = Vec::with_capacity(16);
+    for i in 0..reversed.capacity() {
+        reversed.push(reverse_index(i, reversed.capacity() as u64));
+    }
+    assert_eq!(
+        reversed[..],
+        [0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15]
+    );
+
+    in_place_bit_reverse_permute(&mut reversed[..]);
+    assert_eq!(
+        reversed[..],
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+    );
+}
+
+#[test]
+fn bit_reverse_permutation_edge_case() {
+    let mut edge_case = [0];
+
+    in_place_bit_reverse_permute(&mut edge_case[..]);
+    assert_eq!(edge_case[..], [0]);
+}
diff --git a/crypto/math/src/tests/bowers_fft_tests.rs b/crypto/math/src/tests/bowers_fft_tests.rs
index 70a7107b8..322f8866d 100644
--- a/crypto/math/src/tests/bowers_fft_tests.rs
+++ b/crypto/math/src/tests/bowers_fft_tests.rs
@@ -4,11 +4,9 @@
 
 use crate::fft::bit_reversing::in_place_bit_reverse_permute;
 use crate::fft::bowers_fft::*;
-use crate::fft::reference_fft::in_place_nr_2radix_fft;
-use crate::fft::roots_of_unity::get_powers_of_primitive_root;
 use crate::field::element::FieldElement;
 use crate::field::goldilocks::GoldilocksField;
-use crate::field::traits::{IsFFTField, RootsConfig};
+use crate::field::traits::IsFFTField;
 use alloc::vec;
 use alloc::vec::Vec;
 use proptest::{collection, prelude::*};
@@ -414,55 +412,6 @@ fn test_fft_ifft_roundtrip_edge_cases() {
     assert_eq!(result, sparse, "Roundtrip failed for sparse input");
 }
 
-// =========================================================================
-// Bowers vs Native FFT comparison tests
-// =========================================================================
-
-fn compare_bowers_vs_native(input: &[FE]) {
-    let order = input.len().trailing_zeros() as u64;
-
-    // Native Cooley-Tukey FFT
-    let native_twiddles =
-        get_powers_of_primitive_root::<F>(order, (1 << order) / 2, RootsConfig::BitReverse)
-            .unwrap();
-    let mut native_result = input.to_vec();
-    in_place_nr_2radix_fft::<F, F>(&mut native_result, &native_twiddles);
-    in_place_bit_reverse_permute(&mut native_result);
-
-    // Bowers FFT
-    let layer_twiddles = LayerTwiddles::<F>::new(order).unwrap();
-    let mut bowers_result = input.to_vec();
-    bowers_fft_opt_fused(&mut bowers_result, &layer_twiddles).unwrap();
-    in_place_bit_reverse_permute(&mut bowers_result);
-
-    assert_eq!(bowers_result, native_result);
-}
-
-#[test]
-fn test_bowers_vs_native_various_sizes() {
-    for order in 0..=10u64 {
-        let input: Vec<FE> = (0..(1 << order)).map(|i| FE::from(i as u64)).collect();
-        compare_bowers_vs_native(&input);
-    }
-}
-
-#[test]
-fn test_bowers_vs_native_edge_cases() {
-    // All zeros
-    let zeros: Vec<FE> = (0..64).map(|_| FE::zero()).collect();
-    compare_bowers_vs_native(&zeros);
-
-    // All ones
-    let ones: Vec<FE> = (0..64).map(|_| FE::one()).collect();
-    compare_bowers_vs_native(&ones);
-
-    // Alternating
-    let alternating: Vec<FE> = (0..64)
-        .map(|i| if i % 2 == 0 { FE::zero() } else { FE::one() })
-        .collect();
-    compare_bowers_vs_native(&alternating);
-}
-
 // =========================================================================
 // Property-based tests (proptest)
 // =========================================================================
@@ -498,25 +447,6 @@ proptest! {
         prop_assert_eq!(result, expected);
     }
 
-    #[test]
-    fn proptest_bowers_matches_native_fft(coeffs in field_vec(8)) {
-        let order = coeffs.len().trailing_zeros() as u64;
-
-        // Native FFT
-        let native_twiddles = get_powers_of_primitive_root::<F>(order, (1 << order) / 2, RootsConfig::BitReverse).unwrap();
-        let mut native_result = coeffs.clone();
-        in_place_nr_2radix_fft::<F, F>(&mut native_result, &native_twiddles);
-        in_place_bit_reverse_permute(&mut native_result);
-
-        // Bowers FFT
-        let layer_twiddles = LayerTwiddles::<F>::new(order).unwrap();
-        let mut bowers_result = coeffs;
-        bowers_fft_opt_fused(&mut bowers_result, &layer_twiddles).unwrap();
-        in_place_bit_reverse_permute(&mut bowers_result);
-
-        prop_assert_eq!(bowers_result, native_result);
-    }
-
     #[test]
     fn proptest_bowers_fft_ifft_roundtrip(coeffs in field_vec(8)) {
         let n = coeffs.len();
diff --git a/crypto/math/src/tests/extensions_goldilocks_tests.rs b/crypto/math/src/tests/extensions_goldilocks_tests.rs
new file mode 100644
index 000000000..f13172fbc
--- /dev/null
+++ b/crypto/math/src/tests/extensions_goldilocks_tests.rs
@@ -0,0 +1,23 @@
+//! Tests for `field/extensions_goldilocks.rs`, relocated from its inline test module.
+
+use crate::field::element::FieldElement;
+use crate::field::extensions_goldilocks::{Degree3GoldilocksExtensionField, FpE};
+use crate::traits::{AsBytes, ByteConversion};
+
+#[test]
+fn write_bytes_be_matches_as_bytes() {
+    let cases = [
+        FieldElement::<Degree3GoldilocksExtensionField>::zero(),
+        FieldElement::<Degree3GoldilocksExtensionField>::one(),
+        FieldElement::<Degree3GoldilocksExtensionField>::new([
+            FpE::from(1u64),
+            FpE::from(2u64),
+            FpE::from(3u64),
+        ]),
+    ];
+    for elem in &cases {
+        let mut buf = [0u8; 24];
+        elem.write_bytes_be(&mut buf);
+        assert_eq!(&buf[..], elem.as_bytes().as_slice());
+    }
+}
diff --git a/crypto/math/src/tests/goldilocks_tests.rs b/crypto/math/src/tests/goldilocks_tests.rs
new file mode 100644
index 000000000..b1276224e
--- /dev/null
+++ b/crypto/math/src/tests/goldilocks_tests.rs
@@ -0,0 +1,28 @@
+//! Tests for `field/goldilocks.rs`, relocated from its inline test module.
+
+use crate::field::element::FieldElement;
+use crate::field::goldilocks::{GOLDILOCKS_PRIME, GoldilocksField};
+use crate::traits::{AsBytes, ByteConversion};
+
+#[test]
+fn write_bytes_be_matches_as_bytes() {
+    let cases = [
+        FieldElement::<GoldilocksField>::from(0u64),
+        FieldElement::<GoldilocksField>::from(1u64),
+        FieldElement::<GoldilocksField>::from(GOLDILOCKS_PRIME - 1),
+    ];
+    for elem in &cases {
+        let mut buf = [0u8; 8];
+        elem.write_bytes_be(&mut buf);
+        assert_eq!(&buf[..], elem.as_bytes().as_slice());
+    }
+}
+
+#[test]
+fn write_bytes_be_matches_as_bytes_noncanonical() {
+    // Values stored as-is via from_raw are non-canonical (>= p) until serialized.
+    let elem = FieldElement::<GoldilocksField>::from_raw(GOLDILOCKS_PRIME + 5);
+    let mut buf = [0u8; 8];
+    elem.write_bytes_be(&mut buf);
+    assert_eq!(&buf[..], elem.as_bytes().as_slice());
+}
diff --git a/crypto/math/src/tests/mod.rs b/crypto/math/src/tests/mod.rs
index 08403ac00..2f9cf0b35 100644
--- a/crypto/math/src/tests/mod.rs
+++ b/crypto/math/src/tests/mod.rs
@@ -1,8 +1,11 @@
 pub mod barycentric_tests;
+pub mod bit_reversing_tests;
 pub mod bowers_fft_tests;
+pub mod extensions_goldilocks_tests;
 pub mod fft_friendly_extensions_goldilocks_tests;
 pub mod fft_friendly_u64_goldilocks_tests;
 pub mod fft_tests;
 pub mod field_element_tests;
+pub mod goldilocks_tests;
 pub mod polynomial_tests;
 pub mod test_fields_tests;

From 2c49cea2ea5570f4683c0ec1025ffac86333ce3a Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 22 May 2026 17:25:30 -0300
Subject: [PATCH 09/10] refactor(math): remove dead polynomial-division
 cluster, fix zero-offset panic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses PR review feedback.

Reviewers flagged that `#[cfg(test)]`-gating previously-public methods is a
breaking change. Rather than keep them as `pub(crate)` dead code, the cluster
is removed outright — none of it has a production caller:

- Delete `Polynomial::{fast_division, invert_polynomial_mod,
  fast_fft_multiplication, truncate, reverse}` and `evaluate_fft_bit_reversed`,
  plus their 8 tests. The fast polynomial-division algorithm had no production
  use; it existed only to be tested.
- `get_powers_of_primitive_root` stays `#[cfg(test)]` — it is a genuine test
  helper used to validate the production FFT, not dead code.

Also fixes the pre-existing panic reviewers noted in `interpolate_offset_fft`:
`offset.inv().unwrap()` panicked on a zero coset offset. Add
`FFTError::InvalidCosetOffset` and propagate it instead.
---
 crypto/math/src/fft/errors.rs             |   5 +
 crypto/math/src/polynomial.rs             | 142 +---------------------
 crypto/math/src/tests/fft_tests.rs        |  45 -------
 crypto/math/src/tests/polynomial_tests.rs |  15 ---
 4 files changed, 8 insertions(+), 199 deletions(-)

diff --git a/crypto/math/src/fft/errors.rs b/crypto/math/src/fft/errors.rs
index 8ebb83d99..569b2bd68 100644
--- a/crypto/math/src/fft/errors.rs
+++ b/crypto/math/src/fft/errors.rs
@@ -8,6 +8,8 @@ pub enum FFTError {
     InputError(usize),
     OrderError(u64),
     DomainSizeError(usize),
+    /// A coset offset of zero was supplied; it has no multiplicative inverse.
+    InvalidCosetOffset,
 }
 
 impl Display for FFTError {
@@ -23,6 +25,9 @@ impl Display for FFTError {
             FFTError::DomainSizeError(_) => {
                 write!(f, "Domain size exceeds two adicity of the field")
             }
+            FFTError::InvalidCosetOffset => {
+                write!(f, "Coset offset is zero, which is not invertible")
+            }
         }
     }
 }
diff --git a/crypto/math/src/polynomial.rs b/crypto/math/src/polynomial.rs
index f4351340d..eec7f5bcf 100644
--- a/crypto/math/src/polynomial.rs
+++ b/crypto/math/src/polynomial.rs
@@ -195,25 +195,6 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
                 .collect(),
         }
     }
-
-    /// Only used by the test-only `fast_division` / `invert_polynomial_mod`.
-    #[cfg(test)]
-    pub fn truncate(&self, k: usize) -> Self {
-        if k == 0 {
-            Self::zero()
-        } else {
-            Self::new(&self.coefficients[0..k.min(self.coefficients.len())])
-        }
-    }
-
-    /// Only used by the test-only `fast_division`.
-    #[cfg(test)]
-    pub fn reverse(&self, d: usize) -> Self {
-        let mut coeffs = self.coefficients.clone();
-        coeffs.resize(d + 1, FieldElement::zero());
-        coeffs.reverse();
-        Self::new(&coeffs)
-    }
 }
 
 /// Pads a polynomial with zeros until the desired length
@@ -960,37 +941,6 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
         evaluate_fft_cpu_raw::<F, E>(&coeffs, true)
     }
 
-    /// Same as `evaluate_fft` but returns the evaluations in bit-reversed order,
-    /// skipping the final natural-order permutation. Use when the consumer expects
-    /// bit-reversed input (e.g. FRI commit phase, which pairs consecutive values as
-    /// {f(x), f(-x)}).
-    ///
-    /// Currently exercised only by the FFT property tests; gated to keep it out
-    /// of the production surface until a non-test caller needs it.
-    #[cfg(test)]
-    pub fn evaluate_fft_bit_reversed<F: IsFFTField + IsSubFieldOf<E>>(
-        poly: &Polynomial<FieldElement<E>>,
-        blowup_factor: usize,
-        domain_size: Option<usize>,
-    ) -> Result<Vec<FieldElement<E>>, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let domain_size = domain_size.unwrap_or(0);
-        let len = core::cmp::max(poly.coeff_len(), domain_size).next_power_of_two() * blowup_factor;
-        if len.trailing_zeros() as u64 > F::TWO_ADICITY {
-            return Err(FFTError::DomainSizeError(len.trailing_zeros() as usize));
-        }
-        if poly.coefficients().is_empty() {
-            return Ok(vec![FieldElement::zero(); len]);
-        }
-
-        let mut coeffs = poly.coefficients().to_vec();
-        coeffs.resize(len, FieldElement::zero());
-
-        evaluate_fft_cpu_raw::<F, E>(&coeffs, false)
-    }
-
     /// Returns `N` evaluations with an offset of this polynomial using FFT over a domain in a subfield F of E
     /// (so the results are P(w^i), with w being a primitive root of unity).
     /// `N = max(self.coeff_len(), domain_size).next_power_of_two() * blowup_factor`.
@@ -1046,7 +996,9 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
         E: Send + Sync,
     {
         let scaled = Polynomial::interpolate_fft::<F>(fft_evals)?;
-        Ok(scaled.scale(&offset.inv().unwrap()))
+        // A zero coset offset has no inverse; report it instead of panicking.
+        let offset_inv = offset.inv().map_err(|_| FFTError::InvalidCosetOffset)?;
+        Ok(scaled.scale(&offset_inv))
     }
 
     /// Compute the coset LDE with pre-computed twiddle factors and pre-computed weights.
@@ -1227,91 +1179,3 @@ where
     }
     Ok(result)
 }
-
-#[cfg(test)]
-impl<E: IsField> Polynomial<FieldElement<E>> {
-    /// Multiplies two polynomials using FFT.
-    pub fn fast_fft_multiplication<F: IsFFTField + IsSubFieldOf<E>>(
-        &self,
-        other: &Self,
-    ) -> Result<Self, FFTError>
-    where
-        E: Send + Sync,
-    {
-        let domain_size = self.degree() + other.degree() + 1;
-        let p = Polynomial::evaluate_fft::<F>(self, 1, Some(domain_size))?;
-        let q = Polynomial::evaluate_fft::<F>(other, 1, Some(domain_size))?;
-        let r = p.into_iter().zip(q).map(|(a, b)| a * b).collect::<Vec<_>>();
-
-        Polynomial::interpolate_fft::<F>(&r)
-    }
-
-    /// Divides two polynomials with remainder using FFT.
-    pub fn fast_division<F: IsSubFieldOf<E> + IsFFTField>(
-        &self,
-        divisor: &Self,
-    ) -> Result<(Self, Self), FFTError>
-    where
-        E: Send + Sync,
-    {
-        use crate::field::errors::FieldError;
-
-        let n = self.degree();
-        let m = divisor.degree();
-        if divisor.coefficients.is_empty()
-            || divisor
-                .coefficients
-                .iter()
-                .all(|c| c == &FieldElement::zero())
-        {
-            return Err(FieldError::DivisionByZero.into());
-        }
-        if n < m {
-            return Ok((Self::zero(), self.clone()));
-        }
-        let d = n - m;
-        let a_rev = self.reverse(n);
-        let b_rev = divisor.reverse(m);
-        let inv_b_rev = b_rev.invert_polynomial_mod::<F>(d + 1)?;
-        let q = a_rev
-            .fast_fft_multiplication::<F>(&inv_b_rev)?
-            .truncate(d + 1)
-            .reverse(d);
-
-        let r = self - q.fast_fft_multiplication::<F>(divisor)?;
-        Ok((q, r))
-    }
-
-    /// Computes the inverse of polynomial P modulo x^k using Newton iteration.
-    pub fn invert_polynomial_mod<F: IsSubFieldOf<E> + IsFFTField>(
-        &self,
-        k: usize,
-    ) -> Result<Self, FFTError>
-    where
-        E: Send + Sync,
-    {
-        use crate::field::errors::FieldError;
-
-        if self.coefficients.is_empty()
-            || self.coefficients.iter().all(|c| c == &FieldElement::zero())
-        {
-            return Err(FieldError::DivisionByZero.into());
-        }
-        let mut q = Self::new(&[self.coefficients[0].inv()?]);
-        let mut current_precision = 1;
-
-        let two = Self::new(&[FieldElement::<F>::one() + FieldElement::one()]);
-        while current_precision < k {
-            current_precision *= 2;
-            let temp = self
-                .fast_fft_multiplication::<F>(&q)?
-                .truncate(current_precision);
-            let correction = &two - temp;
-            q = q
-                .fast_fft_multiplication::<F>(&correction)?
-                .truncate(current_precision);
-        }
-
-        Ok(q.truncate(k))
-    }
-}
diff --git a/crypto/math/src/tests/fft_tests.rs b/crypto/math/src/tests/fft_tests.rs
index 61fc48c76..50b618545 100644
--- a/crypto/math/src/tests/fft_tests.rs
+++ b/crypto/math/src/tests/fft_tests.rs
@@ -48,7 +48,6 @@ mod fft_helpers_test {
 mod fft_polynomial_tests {
     use crate::field::traits::IsField;
 
-    use crate::fft::bit_reversing::in_place_bit_reverse_permute;
     use crate::fft::roots_of_unity::{
         get_powers_of_primitive_root, get_powers_of_primitive_root_coset,
     };
@@ -228,32 +227,6 @@ mod fft_polynomial_tests {
                 prop_assert_eq!(poly, new_poly);
             }
 
-            // Property-based test that ensures evaluate_fft_bit_reversed returns the same
-            // values as evaluate_fft followed by an in-place bit-reverse permutation,
-            // across varying blowup factors.
-            #[test]
-            fn test_fft_bit_reversed_matches_evaluate_fft_then_permute(poly in poly(6), blowup_factor in powers_of_two(4)) {
-                let mut expected = Polynomial::evaluate_fft::<F>(&poly, blowup_factor, None).unwrap();
-                in_place_bit_reverse_permute(&mut expected);
-                let got = Polynomial::evaluate_fft_bit_reversed::<F>(&poly, blowup_factor, None).unwrap();
-                prop_assert_eq!(got, expected);
-            }
-
-            #[test]
-            fn test_fft_multiplication_works(poly in poly(7), other in poly(7)) {
-                prop_assert_eq!(poly.fast_fft_multiplication::<F>(&other).unwrap(), poly * other);
-            }
-
-            #[test]
-            fn test_fft_division_works(poly in non_zero_poly(7), other in non_zero_poly(7)) {
-                prop_assert_eq!(poly.fast_division::<F>(&other).unwrap(), poly.long_division_with_remainder(&other));
-            }
-
-            #[test]
-            fn test_invert_polynomial_mod_works(poly in non_zero_poly(7), k in powers_of_two(4)) {
-                let inverted_poly = poly.invert_polynomial_mod::<F>(k).unwrap();
-                prop_assert_eq!((poly * inverted_poly).truncate(k), Polynomial::new(&[FE::one()]));
-            }
         }
 
         #[test]
@@ -265,24 +238,6 @@ mod fft_polynomial_tests {
                 Polynomial::new(&[FE::new(0), FE::new(0), FE::new(0), FE::new(2)])
             );
         }
-
-        #[test]
-        fn fft_bit_reversed_handles_domain_size_greater_than_coeff_len() {
-            let poly = Polynomial::new(&[FE::new(1), FE::new(2), FE::new(3)]);
-            let domain_size = 32;
-            let mut expected = Polynomial::evaluate_fft::<F>(&poly, 1, Some(domain_size)).unwrap();
-            in_place_bit_reverse_permute(&mut expected);
-            let got =
-                Polynomial::evaluate_fft_bit_reversed::<F>(&poly, 1, Some(domain_size)).unwrap();
-            assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn fft_bit_reversed_returns_zeros_for_empty_polynomial() {
-            let poly: Polynomial<FE> = Polynomial::new(&[]);
-            let got = Polynomial::evaluate_fft_bit_reversed::<F>(&poly, 1, Some(8)).unwrap();
-            assert_eq!(got, vec![FE::zero(); 8]);
-        }
     }
 
     #[test]
diff --git a/crypto/math/src/tests/polynomial_tests.rs b/crypto/math/src/tests/polynomial_tests.rs
index c78e78f7e..a066ad1af 100644
--- a/crypto/math/src/tests/polynomial_tests.rs
+++ b/crypto/math/src/tests/polynomial_tests.rs
@@ -512,21 +512,6 @@ mod tests {
         assert_eq!(dpdx, Polynomial::new(&[FE::new(0)]));
     }
 
-    #[test]
-    fn test_reverse() {
-        let p = Polynomial::new(&[FE::new(3), FE::new(2), FE::new(1)]);
-        assert_eq!(
-            p.reverse(3),
-            Polynomial::new(&[FE::new(0), FE::new(1), FE::new(2), FE::new(3)])
-        );
-    }
-
-    #[test]
-    fn test_truncate() {
-        let p = Polynomial::new(&[FE::new(3), FE::new(2), FE::new(1)]);
-        assert_eq!(p.truncate(2), Polynomial::new(&[FE::new(3), FE::new(2)]));
-    }
-
     #[test]
     fn test_print_as_sage_poly() {
         let p = Polynomial::new(&[FE::new(1), FE::new(2), FE::new(3)]);

From 27bd36f1d7baa0e9198b59166e3f568bcc9185ab Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 22 May 2026 18:02:20 -0300
Subject: [PATCH 10/10] refactor(math): remove dead Polynomial APIs and
 self-referential tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continues the dead-code audit of polynomial.rs. Every item removed here
has no production caller — each is a function reachable only from its own
tests, or a test-file algorithm that tests itself.

Removed from polynomial.rs:
- to_extension (zero callers anywhere)
- long_division_with_remainder, leading_coefficient
- ruffini_division_inplace
- interpolate_coset_eval_with_g_n_inv (base-field variant)

Removed the matching self-referential tests/helpers from
polynomial_tests.rs (division_works, division_by_zero_degree_polynomial_works,
test_xgcd, ruffini_inplace_* and the div_with_ref / xgcd / ruffini_division
helpers).

The 3 barycentric tests that exercised the base-field
interpolate_coset_eval_with_g_n_inv now call the production-used
interpolate_coset_eval_ext_with_g_n_inv (instantiated E=F), so they keep
covering the real code path.
---
 crypto/math/src/polynomial.rs              |  99 ---------------
 crypto/math/src/tests/barycentric_tests.rs |   7 +-
 crypto/math/src/tests/polynomial_tests.rs  | 136 +--------------------
 3 files changed, 4 insertions(+), 238 deletions(-)

diff --git a/crypto/math/src/polynomial.rs b/crypto/math/src/polynomial.rs
index eec7f5bcf..b7aeba734 100644
--- a/crypto/math/src/polynomial.rs
+++ b/crypto/math/src/polynomial.rs
@@ -67,15 +67,6 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
         }
     }
 
-    /// Returns the coefficient accompanying x^degree
-    pub fn leading_coefficient(&self) -> FieldElement<F> {
-        if let Some(coefficient) = self.coefficients.last() {
-            coefficient.clone()
-        } else {
-            FieldElement::zero()
-        }
-    }
-
     /// Returns coefficients of the polynomial as an array
     /// \[c_0, c_1, c_2, ..., c_n\]
     /// that represents the polynomial
@@ -89,29 +80,6 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
         self.coefficients().len()
     }
 
-    /// Computes quotient and remainder of polynomial division.
-    ///
-    /// Output: (quotient, remainder)
-    pub fn long_division_with_remainder(self, dividend: &Self) -> (Self, Self) {
-        if dividend.degree() > self.degree() {
-            (Polynomial::zero(), self)
-        } else {
-            let mut n = self;
-            let mut q: Vec<FieldElement<F>> = vec![FieldElement::zero(); n.degree() + 1];
-            let denominator = dividend.leading_coefficient().inv().unwrap();
-            while n != Polynomial::zero() && n.degree() >= dividend.degree() {
-                let new_coefficient = n.leading_coefficient() * &denominator;
-                q[n.degree() - dividend.degree()] = new_coefficient.clone();
-                let d = dividend.mul_with_ref(&Polynomial::new_monomial(
-                    new_coefficient,
-                    n.degree() - dividend.degree(),
-                ));
-                n = n - d;
-            }
-            (Polynomial::new(&q), n)
-        }
-    }
-
     pub fn mul_with_ref(&self, factor: &Self) -> Self {
         let degree = self.degree() + factor.degree();
         let mut coefficients = vec![FieldElement::zero(); degree + 1];
@@ -179,22 +147,6 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
         }
         parts
     }
-
-    /// Embeds the coefficients of a polynomial into an extension field
-    /// For example, given a polynomial with coefficients in F_p, returns the same
-    /// polynomial with its coefficients as elements in F_{p^2}
-    pub fn to_extension<L: IsField>(self) -> Polynomial<FieldElement<L>>
-    where
-        F: IsSubFieldOf<L>,
-    {
-        Polynomial {
-            coefficients: self
-                .coefficients
-                .into_iter()
-                .map(|x| x.to_extension::<L>())
-                .collect(),
-        }
-    }
 }
 
 /// Pads a polynomial with zeros until the desired length
@@ -726,47 +678,6 @@ where
     denoms
 }
 
-/// Like `interpolate_coset_eval` but takes a precomputed `g_n_inv = (g^N)^{-1}`.
-///
-/// Use this when evaluating multiple columns at the same coset — the inverse is
-/// constant across all columns and should be computed once.
-///
-/// Both `coset_offset_pow_n` and `g_n_inv` stay in the base field F. The function
-/// uses F×E→E mixed arithmetic for the final multiplication, avoiding any field
-/// embedding/conversion overhead.
-pub fn interpolate_coset_eval_with_g_n_inv<F, E>(
-    z_pow_n: &FieldElement<E>,
-    coset_offset_pow_n: &FieldElement<F>,
-    n_inv: &FieldElement<F>,
-    g_n_inv: &FieldElement<F>,
-    coset_points: &[FieldElement<F>],
-    evaluations: &[FieldElement<F>],
-    inv_denoms: &[FieldElement<E>],
-) -> FieldElement<E>
-where
-    F: IsSubFieldOf<E>,
-    E: IsField,
-{
-    debug_assert_eq!(coset_points.len(), evaluations.len());
-    debug_assert_eq!(coset_points.len(), inv_denoms.len());
-
-    // sum = sum_{i} (g*w^i) * f(g*w^i) / (z - g*w^i)
-    // point * eval is in base field F; multiply by inv_d lifts to E
-    let sum: FieldElement<E> = coset_points
-        .iter()
-        .zip(evaluations.iter())
-        .zip(inv_denoms.iter())
-        .fold(FieldElement::<E>::zero(), |acc, ((point, eval), inv_d)| {
-            acc + (point * eval) * inv_d
-        });
-
-    // f(z) = (z^N - g^N) / (N * g^N) * sum
-    // All scalar factors in base field F; vanishing via sub_subfield.
-    let vanishing = z_pow_n.sub_subfield(coset_offset_pow_n); // E - F → E
-    let scalar = n_inv * g_n_inv; // F * F → F
-    &scalar * &(vanishing * &sum) // F × E → E
-}
-
 /// Like `interpolate_coset_eval_ext` but takes a precomputed `g_n_inv = (g^N)^{-1}`.
 ///
 /// Both `coset_offset_pow_n` and `g_n_inv` stay in the base field F.
@@ -859,16 +770,6 @@ impl<F: IsField> Polynomial<FieldElement<F>> {
         }
         Ok(result)
     }
-
-    /// Computes quotient with `x - b` in place.
-    pub fn ruffini_division_inplace(&mut self, b: &FieldElement<F>) {
-        let mut c = FieldElement::zero();
-        for coeff in self.coefficients.iter_mut().rev() {
-            *coeff = &*coeff + b * &c;
-            core::mem::swap(coeff, &mut c);
-        }
-        self.coefficients.pop();
-    }
 }
 
 // =============================================================================
diff --git a/crypto/math/src/tests/barycentric_tests.rs b/crypto/math/src/tests/barycentric_tests.rs
index aff4afbc6..a1ea40236 100644
--- a/crypto/math/src/tests/barycentric_tests.rs
+++ b/crypto/math/src/tests/barycentric_tests.rs
@@ -3,7 +3,6 @@ use crate::field::element::FieldElement;
 use crate::field::goldilocks::GoldilocksField;
 use crate::polynomial::{
     Polynomial, barycentric_inv_denoms, interpolate_coset_eval_ext_with_g_n_inv,
-    interpolate_coset_eval_with_g_n_inv,
 };
 
 type FE = FieldElement<GoldilocksField>;
@@ -32,7 +31,7 @@ fn barycentric_matches_horner_simple() {
     let n_inv = FE::from(n as u64).inv().unwrap();
     let g_n_inv = g_pow_n.inv().unwrap();
     let inv_denoms = barycentric_inv_denoms(&z, &coset_points);
-    let result = interpolate_coset_eval_with_g_n_inv(
+    let result = interpolate_coset_eval_ext_with_g_n_inv(
         &z_pow_n,
         &g_pow_n,
         &n_inv,
@@ -65,7 +64,7 @@ fn barycentric_matches_horner_various_sizes() {
         let n_inv = FE::from(n as u64).inv().unwrap();
         let g_n_inv = g_pow_n.inv().unwrap();
         let inv_denoms = barycentric_inv_denoms(&z, &coset_points);
-        let result = interpolate_coset_eval_with_g_n_inv(
+        let result = interpolate_coset_eval_ext_with_g_n_inv(
             &z_pow_n,
             &g_pow_n,
             &n_inv,
@@ -136,7 +135,7 @@ fn barycentric_from_lde_stride() {
     let n_inv = FE::from(n as u64).inv().unwrap();
     let g_n_inv = g_pow_n.inv().unwrap();
     let inv_denoms = barycentric_inv_denoms(&z, &trace_coset);
-    let result = interpolate_coset_eval_with_g_n_inv(
+    let result = interpolate_coset_eval_ext_with_g_n_inv(
         &z_pow_n,
         &g_pow_n,
         &n_inv,
diff --git a/crypto/math/src/tests/polynomial_tests.rs b/crypto/math/src/tests/polynomial_tests.rs
index a066ad1af..ca1bfde22 100644
--- a/crypto/math/src/tests/polynomial_tests.rs
+++ b/crypto/math/src/tests/polynomial_tests.rs
@@ -2,7 +2,7 @@
 mod tests {
     use crate::field::element::FieldElement;
     use crate::field::goldilocks::GoldilocksField;
-    use crate::field::traits::{IsField, IsPrimeField, IsSubFieldOf};
+    use crate::field::traits::{IsField, IsPrimeField};
     use crate::polynomial::{Polynomial, pad_with_zero_coefficients};
     use alloc::string::{String, ToString};
     use alloc::{format, vec, vec::Vec};
@@ -35,72 +35,6 @@ mod tests {
         Polynomial::new(&derivative)
     }
 
-    /// Computes the quotient of the division of P(x) with x - b using Ruffini's rule
-    fn ruffini_division<F, L>(
-        poly: &Polynomial<FieldElement<F>>,
-        b: &FieldElement<L>,
-    ) -> Polynomial<FieldElement<L>>
-    where
-        L: IsField,
-        F: IsSubFieldOf<L>,
-    {
-        if let Some(c) = poly.coefficients().last() {
-            let mut c = c.clone().to_extension();
-            let mut coefficients = Vec::with_capacity(poly.degree());
-            for coeff in poly.coefficients().iter().rev().skip(1) {
-                coefficients.push(c.clone());
-                c = coeff + c * b;
-            }
-            coefficients = coefficients.into_iter().rev().collect();
-            Polynomial::new(&coefficients)
-        } else {
-            Polynomial::zero()
-        }
-    }
-
-    /// Computes quotient only (discards remainder)
-    fn div_with_ref<F: IsField>(
-        poly: Polynomial<FieldElement<F>>,
-        dividend: &Polynomial<FieldElement<F>>,
-    ) -> Polynomial<FieldElement<F>> {
-        let (quotient, _remainder) = poly.long_division_with_remainder(dividend);
-        quotient
-    }
-
-    /// Extended Euclidean Algorithm for polynomials
-    #[allow(clippy::type_complexity)]
-    fn xgcd<F: IsField>(
-        poly: &Polynomial<FieldElement<F>>,
-        y: &Polynomial<FieldElement<F>>,
-    ) -> (
-        Polynomial<FieldElement<F>>,
-        Polynomial<FieldElement<F>>,
-        Polynomial<FieldElement<F>>,
-    ) {
-        let one = Polynomial::new(&[FieldElement::one()]);
-        let zero = Polynomial::zero();
-        let (mut old_r, mut r) = (poly.clone(), y.clone());
-        let (mut old_s, mut s) = (one.clone(), zero.clone());
-        let (mut old_t, mut t) = (zero.clone(), one.clone());
-
-        while r != Polynomial::zero() {
-            let quotient = div_with_ref(old_r.clone(), &r);
-            old_r = old_r - &quotient * &r;
-            core::mem::swap(&mut old_r, &mut r);
-            old_s = old_s - &quotient * &s;
-            core::mem::swap(&mut old_s, &mut s);
-            old_t = old_t - &quotient * &t;
-            core::mem::swap(&mut old_t, &mut t);
-        }
-
-        let lcinv = old_r.leading_coefficient().inv().unwrap();
-        (
-            old_s.scale_coeffs(&lcinv),
-            old_t.scale_coeffs(&lcinv),
-            old_r.scale_coeffs(&lcinv),
-        )
-    }
-
     /// Print the polynomial as a string ready to be used in SageMath
     fn print_as_sage_poly<F: IsPrimeField>(
         poly: &Polynomial<FieldElement<F>>,
@@ -297,23 +231,6 @@ mod tests {
         );
     }
 
-    #[test]
-    fn division_works() {
-        let p1 = Polynomial::new(&[FE::new(1), FE::new(3)]);
-        let p2 = Polynomial::new(&[FE::new(1), FE::new(3)]);
-        let p3 = p1.mul_with_ref(&p2);
-        assert_eq!(div_with_ref(p3, &p2), p1);
-    }
-
-    #[test]
-    fn division_by_zero_degree_polynomial_works() {
-        let four = FE::new(4);
-        let two = FE::new(2);
-        let p1 = Polynomial::new(&[four, four]);
-        let p2 = Polynomial::new(&[two]);
-        assert_eq!(Polynomial::new(&[two, two]), div_with_ref(p1, &p2));
-    }
-
     #[test]
     fn evaluate_constant_polynomial_returns_constant() {
         let three = FE::new(3);
@@ -446,57 +363,6 @@ mod tests {
         assert_eq!(p1, &p1_expected);
     }
 
-    use proptest::prelude::*;
-    proptest! {
-        #[test]
-        fn ruffini_inplace_equals_division(p in any::<Vec<u64>>(), b in any::<u64>()) {
-            let p: Vec<_> = p.into_iter().map(FE::from).collect();
-            let mut p = Polynomial::new(&p);
-            let b = FE::from(b);
-
-            let p_ref = p.clone();
-            let m = Polynomial::new_monomial(FE::one(), 1) - b;
-
-            p.ruffini_division_inplace(&b);
-            prop_assert_eq!(p, div_with_ref(p_ref, &m));
-        }
-    }
-
-    proptest! {
-        #[test]
-        fn ruffini_inplace_equals_ruffini(p in any::<Vec<u64>>(), b in any::<u64>()) {
-            let p: Vec<_> = p.into_iter().map(FE::from).collect();
-            let mut p = Polynomial::new(&p);
-            let b = FE::from(b);
-            let q = ruffini_division(&p, &b);
-            p.ruffini_division_inplace(&b);
-            prop_assert_eq!(q, p);
-        }
-    }
-    #[test]
-    fn test_xgcd() {
-        // Case 1: Simple polynomials
-        let p1 = Polynomial::new(&[FE::new(1), FE::new(0), FE::new(1)]); // x^2 + 1
-        let p2 = Polynomial::new(&[FE::new(1), FE::new(1)]); // x + 1
-        let (a, b, g) = xgcd(&p1, &p2);
-        // Check that a * p1 + b * p2 = g
-        let lhs = a.mul_with_ref(&p1) + b.mul_with_ref(&p2);
-        assert_eq!(lhs, g);
-        assert_eq!(g, Polynomial::new(&[FE::new(1)]));
-
-        // x^2-1 :
-        let p3 = Polynomial::new(&[-FE::new(1), FE::new(0), FE::new(1)]);
-        // x^3-x = x(x^2-1)
-        let p4 = Polynomial::new(&[FE::new(0), -FE::new(1), FE::new(0), FE::new(1)]);
-        let (a, b, g) = xgcd(&p3, &p4);
-
-        let lhs = a.mul_with_ref(&p3) + b.mul_with_ref(&p4);
-        assert_eq!(a, Polynomial::new(&[FE::new(1)]));
-        assert_eq!(b, Polynomial::zero());
-        assert_eq!(lhs, g);
-        assert_eq!(g, p3);
-    }
-
     #[test]
     fn test_differentiate() {
         // 3x^2 + 2x + 42