From f7f842bba7b9e96d58398781480d2ca11abc46bf Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 25 Feb 2026 19:00:35 -0300
Subject: [PATCH 01/34] bench vs others

---
 .gitignore                                 |    1 +
 bench_vs/README.md                         |   59 +
 bench_vs/run.sh                            |  195 +
 bench_vs/sp1/fibonacci/.tldr/daemon.pid    |    1 +
 bench_vs/sp1/fibonacci/.tldr/status        |    1 +
 bench_vs/sp1/fibonacci/.tldrignore         |   84 +
 bench_vs/sp1/fibonacci/Cargo.lock          | 6182 ++++++++++++++++++++
 bench_vs/sp1/fibonacci/Cargo.toml          |    3 +
 bench_vs/sp1/fibonacci/program/Cargo.toml  |    7 +
 bench_vs/sp1/fibonacci/program/src/main.rs |   14 +
 bench_vs/sp1/fibonacci/rust-toolchain      |    3 +
 bench_vs/sp1/fibonacci/script/Cargo.toml   |   10 +
 bench_vs/sp1/fibonacci/script/build.rs     |    5 +
 bench_vs/sp1/fibonacci/script/src/main.rs  |   47 +
 14 files changed, 6612 insertions(+)
 create mode 100644 bench_vs/README.md
 create mode 100755 bench_vs/run.sh
 create mode 100644 bench_vs/sp1/fibonacci/.tldr/daemon.pid
 create mode 100644 bench_vs/sp1/fibonacci/.tldr/status
 create mode 100644 bench_vs/sp1/fibonacci/.tldrignore
 create mode 100644 bench_vs/sp1/fibonacci/Cargo.lock
 create mode 100644 bench_vs/sp1/fibonacci/Cargo.toml
 create mode 100644 bench_vs/sp1/fibonacci/program/Cargo.toml
 create mode 100644 bench_vs/sp1/fibonacci/program/src/main.rs
 create mode 100644 bench_vs/sp1/fibonacci/rust-toolchain
 create mode 100644 bench_vs/sp1/fibonacci/script/Cargo.toml
 create mode 100644 bench_vs/sp1/fibonacci/script/build.rs
 create mode 100644 bench_vs/sp1/fibonacci/script/src/main.rs

diff --git a/.gitignore b/.gitignore
index 9c826f0d9..3ef9f8283 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ executor/program_artifacts/
 # Shared cargo target directory for ELF builds
 executor/shared_target/
 
+
diff --git a/bench_vs/README.md b/bench_vs/README.md
new file mode 100644
index 000000000..0a20304c3
--- /dev/null
+++ b/bench_vs/README.md
@@ -0,0 +1,59 @@
+# Lambda VM vs SP1 v6 Benchmark
+
+Compares proving time for an identical u64 wrapping Fibonacci computation.
+
+## Prerequisites
+
+1. **Lambda VM CLI** (built from this repo):
+   ```bash
+   cargo build --release -p cli
+   ```
+
+2. **SP1 toolchain** (Succinct's prover):
+   ```bash
+   curl -L https://sp1up.succinct.xyz | bash
+   sp1up
+   ```
+
+3. **RISC-V assembler** — Homebrew clang + ld.lld (macOS):
+   ```bash
+   brew install llvm lld   # llvm is keg-only: add "$(brew --prefix llvm)/bin" to PATH
+   ```
+
+## Usage
+
+```bash
+# Default series: 1k, 10k, 100k, 300k iterations
+./bench_vs/run.sh
+
+# Custom series
+./bench_vs/run.sh -n 1000 50000
+
+# Run only one prover
+./bench_vs/run.sh --lambda-only
+./bench_vs/run.sh --sp1-only
+```
+
+## What it measures
+
+Both provers execute the same program: iterative Fibonacci with `u64::wrapping_add`.
+Only **proving time** is compared (wall-clock, no recursion/compression on either side).
+
+- **Lambda VM**: Generates RISC-V assembly at runtime, assembles to ELF, proves via the CLI.
+- **SP1 v6**: Compiles a Rust guest program to RISC-V, proves via `sp1-sdk` core mode.
+
+## Output
+
+```
+=== Summary ===
+Program: Fibonacci (u64 wrapping)
+
+  n           Lambda VM       SP1 v6     Ratio
+  ---         ---------       ------     -----
+  1000          13.3s         12.4s      0.9x
+  10000         22.4s         12.9s      0.6x
+  100000       116.4s         14.7s      0.1x
+  300000          ...           ...       ...
+
+Green ratio = Lambda VM faster, Red = SP1 faster
+```
diff --git a/bench_vs/run.sh b/bench_vs/run.sh
new file mode 100755
index 000000000..1575e62a3
--- /dev/null
+++ b/bench_vs/run.sh
@@ -0,0 +1,195 @@
+#!/bin/bash
+# Benchmark: Lambda VM vs SP1 v6 — Fibonacci proving time comparison.
+#
+# Usage: ./bench_vs/run.sh [-n 1000 50000 100000] [--lambda-only | --sp1-only]
+#
+# Without -n, runs the default series: 1000 10000 100000 300000
+# With -n, runs the specified values (space-separated): -n 1000 50000
+#
+# Prerequisites:
+#   - Lambda VM CLI built: cargo build --release -p cli
+#   - SP1 toolchain installed: curl -L https://sp1up.succinct.xyz | bash && sp1up
+#   - clang with RISC-V target support (macOS Homebrew clang works)
+#   - ld.lld linker
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+TMP_DIR="/tmp/bench_fib"
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BOLD='\033[1m'
+NC='\033[0m'
+
+# --- Defaults ----------------------------------------------------------------
+DEFAULT_SERIES=(1000 10000 100000 300000)
+SERIES=()
+RUN_LAMBDA=true
+RUN_SP1=true
+
+# --- Parse args --------------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -n) shift  # values run up to the next "-..." word, so "-n 10 -h" still reaches -h
+            while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do
+                # Positive decimal only: N is spliced into assembly, and N=0 would wrap the loop counter.
+                [[ "$1" =~ ^[1-9][0-9]*$ ]] || { echo "Invalid iteration count: $1" >&2; exit 1; }
+                SERIES+=("$1"); shift
+            done ;;
+        --lambda-only) RUN_SP1=false; shift ;;
+        --sp1-only) RUN_LAMBDA=false; shift ;;
+        -h|--help)
+            echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only]"; echo ""
+            echo "  -n N1 N2 ...    Fibonacci iteration counts (space-separated)"
+            echo "                  Default series: ${DEFAULT_SERIES[*]}"
+            echo "  --lambda-only   Only run Lambda VM benchmark"
+            echo "  --sp1-only      Only run SP1 benchmark"
+            exit 0 ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+
+if [ ${#SERIES[@]} -eq 0 ]; then
+    SERIES=("${DEFAULT_SERIES[@]}")
+fi
+
+echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}"
+echo -e "Series: ${YELLOW}${SERIES[*]}${NC}"
+echo ""
+
+rm -rf "$TMP_DIR" && mkdir -p "$TMP_DIR"
+
+# --- Pre-build ---------------------------------------------------------------
+
+CLI="$ROOT_DIR/target/release/cli"
+if $RUN_LAMBDA && [ ! -f "$CLI" ]; then
+    echo -e "${YELLOW}[Lambda VM] CLI not found, building...${NC}"
+    cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1
+fi
+
+SP1_BIN=""
+if $RUN_SP1; then
+    SP1_DIR="$SCRIPT_DIR/sp1/fibonacci"
+    echo -e "${GREEN}[SP1 v6] Building fibonacci prover...${NC}"
+    (cd "$SP1_DIR" && cargo build --release 2>&1 | tail -5)
+    SP1_BIN="$SP1_DIR/target/release/fibonacci-script"
+    if [ ! -f "$SP1_BIN" ]; then
+        echo -e "${RED}[SP1 v6] Build failed — fibonacci-script binary not found${NC}"
+        exit 1
+    fi
+fi
+
+# --- Run one benchmark --------------------------------------------------------
+
+# Arrays to collect results for the summary table
+declare -a RESULT_N RESULT_LAMBDA RESULT_SP1
+
+run_one() {
+    # Prove Fibonacci for iteration count $1 with each enabled prover, then append the
+    # count and the parsed proving times ("n/a" when absent) to the RESULT_* arrays.
+    local N=$1
+    local lambda_time="" sp1_time="" sp1_cycles="" lambda_output="" sp1_output=""
+    echo ""
+    echo -e "${BOLD}--- n=${N} ---${NC}"
+
+    # `|| true` below: a grep miss must yield "n/a", not abort the script via `set -euo pipefail`.
+    if $RUN_LAMBDA; then
+        # Generate assembly: N Fibonacci steps (N >= 1; the counter is decremented before the test).
+        cat > "$TMP_DIR/fib.s" <<ASM
+	.attribute	5, "rv64i2p1_m2p0"
+	.globl	main
+main:
+	li	t0, 0
+	li	t1, 1
+	li	a0, ${N}
+
+.loop:
+	add	t2, t0, t1
+	mv	t0, t1
+	mv	t1, t2
+	addi	a0, a0, -1
+	bnez	a0, .loop
+
+	mv	a0, t1
+	li	a7, 5
+	ecall
+ASM
+        clang --target=riscv64 -march=rv64im -mabi=lp64 -nostdlib \
+            -c "$TMP_DIR/fib.s" -o "$TMP_DIR/fib.o"
+        ld.lld -o "$TMP_DIR/fib.elf" "$TMP_DIR/fib.o" --entry=main
+
+        echo -e "  ${GREEN}[Lambda VM] Proving...${NC}"
+        lambda_output=$("$CLI" prove "$TMP_DIR/fib.elf" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null)
+        lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*' || true)
+        echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
+    fi
+
+    if $RUN_SP1; then
+        echo -e "  ${GREEN}[SP1 v6] Proving...${NC}"
+        sp1_output=$("$SP1_BIN" "$N" 2>/dev/null)
+        sp1_time=$(echo "$sp1_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*' || true)
+        sp1_cycles=$(echo "$sp1_output" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*' || true)
+        echo -e "  SP1 v6:    ${BOLD}${sp1_time}s${NC} (${sp1_cycles} cycles)"
+    fi
+
+    RESULT_N+=("$N")
+    RESULT_LAMBDA+=("${lambda_time:-n/a}")
+    RESULT_SP1+=("${sp1_time:-n/a}")
+}
+
+# --- Run series ---------------------------------------------------------------
+
+for N in "${SERIES[@]}"; do
+    run_one "$N"
+done
+
+# --- Summary table ------------------------------------------------------------
+
+echo ""
+echo -e "${BOLD}=== Summary ===${NC}"
+echo -e "Program: Fibonacci (u64 wrapping)"
+echo ""
+
+# Header
+if $RUN_LAMBDA && $RUN_SP1; then
+    printf "  %-10s  %12s  %12s  %8s\n" "n" "Lambda VM" "SP1 v6" "Ratio"
+    printf "  %-10s  %12s  %12s  %8s\n" "---" "---------" "------" "-----"
+elif $RUN_LAMBDA; then
+    printf "  %-10s  %12s\n" "n" "Lambda VM"
+    printf "  %-10s  %12s\n" "---" "---------"
+else
+    printf "  %-10s  %12s\n" "n" "SP1 v6"
+    printf "  %-10s  %12s\n" "---" "------"
+fi
+
+for i in "${!RESULT_N[@]}"; do
+    n="${RESULT_N[$i]}"
+    lt="${RESULT_LAMBDA[$i]}"
+    st="${RESULT_SP1[$i]}"
+
+    if $RUN_LAMBDA && $RUN_SP1; then
+        if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then
+            RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $st / $lt}")
+            if (( $(LC_NUMERIC=C awk "BEGIN {print ($st > $lt)}") )); then
+                RATIO="${GREEN}${RATIO}${NC}"
+            else
+                RATIO="${RED}${RATIO}${NC}"
+            fi
+            printf "  %-10s  %11ss  %11ss  " "$n" "$lt" "$st"
+            echo -e "$RATIO"
+        else
+            printf "  %-10s  %12s  %12s  %8s\n" "$n" "${lt}s" "${st}s" "-"
+        fi
+    elif $RUN_LAMBDA; then
+        printf "  %-10s  %11ss\n" "$n" "$lt"
+    else
+        printf "  %-10s  %11ss\n" "$n" "$st"
+    fi
+done
+
+echo ""
+echo -e "Green ratio = Lambda VM faster, Red = SP1 faster"
+echo "Raw data in $TMP_DIR/"
diff --git a/bench_vs/sp1/fibonacci/.tldr/daemon.pid b/bench_vs/sp1/fibonacci/.tldr/daemon.pid
new file mode 100644
index 000000000..10eda36c4
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/.tldr/daemon.pid
@@ -0,0 +1 @@
+39495
\ No newline at end of file
diff --git a/bench_vs/sp1/fibonacci/.tldr/status b/bench_vs/sp1/fibonacci/.tldr/status
new file mode 100644
index 000000000..ad50b5340
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/.tldr/status
@@ -0,0 +1 @@
+ready
\ No newline at end of file
diff --git a/bench_vs/sp1/fibonacci/.tldrignore b/bench_vs/sp1/fibonacci/.tldrignore
new file mode 100644
index 000000000..e01df83cb
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/.tldrignore
@@ -0,0 +1,84 @@
+# TLDR ignore patterns (gitignore syntax)
+# Auto-generated - review and customize for your project
+# Docs: https://git-scm.com/docs/gitignore
+
+# ===================
+# Dependencies
+# ===================
+node_modules/
+.venv/
+venv/
+env/
+__pycache__/
+.tox/
+.nox/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+vendor/
+Pods/
+
+# ===================
+# Build outputs
+# ===================
+dist/
+build/
+out/
+target/
+*.egg-info/
+*.whl
+*.pyc
+*.pyo
+
+# ===================
+# Binary/large files
+# ===================
+*.so
+*.dylib
+*.dll
+*.exe
+*.bin
+*.o
+*.a
+*.lib
+
+# ===================
+# IDE/editors
+# ===================
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+
+# ===================
+# Security (always exclude)
+# ===================
+.env
+.env.*
+*.pem
+*.key
+*.p12
+*.pfx
+credentials.*
+secrets.*
+
+# ===================
+# Version control
+# ===================
+.git/
+.hg/
+.svn/
+
+# ===================
+# OS files
+# ===================
+.DS_Store
+Thumbs.db
+
+# ===================
+# Project-specific
+# Add your custom patterns below
+# ===================
+# large_test_fixtures/
+# data/
diff --git a/bench_vs/sp1/fibonacci/Cargo.lock b/bench_vs/sp1/fibonacci/Cargo.lock
new file mode 100644
index 000000000..8825cad2e
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/Cargo.lock
@@ -0,0 +1,6182 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addchain"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2e69442aa5628ea6951fa33e24efe8313f4321a91bd729fc2f75bdfc858570"
+dependencies = [
+ "num-bigint 0.3.3",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "addr2line"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "ansi_term"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "anstream"
+version = "0.6.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+
+[[package]]
+name = "ark-ff"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec847af850f44ad29048935519032c33da8aa03340876d351dfab5660d2966ba"
+dependencies = [
+ "ark-ff-asm",
+ "ark-ff-macros",
+ "ark-serialize",
+ "ark-std",
+ "derivative",
+ "digest",
+ "itertools 0.10.5",
+ "num-bigint 0.4.6",
+ "num-traits",
+ "paste",
+ "rustc_version",
+ "zeroize",
+]
+
+[[package]]
+name = "ark-ff-asm"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ed4aa4fe255d0bc6d79373f7e31d2ea147bcf486cba1be5ba7ea85abdb92348"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "ark-ff-macros"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7abe79b0e4288889c4574159ab790824d0033b9fdcb2a112a3182fac2e514565"
+dependencies = [
+ "num-bigint 0.4.6",
+ "num-traits",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "ark-serialize"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb7b85a02b83d2f22f89bd5cac66c9c89474240cb6207cb1efc16d098e822a5"
+dependencies = [
+ "ark-std",
+ "digest",
+ "num-bigint 0.4.6",
+]
+
+[[package]]
+name = "ark-std"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94893f1e0c6eeab764ade8dc4c0db24caf4fe7cbbaafc0eba0a9030f447b5185"
+dependencies = [
+ "num-traits",
+ "rand 0.8.5",
+]
+
+[[package]]
+name = "arrayref"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
+
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
+[[package]]
+name = "async-scoped"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4042078ea593edffc452eef14e99fdb2b120caa4ad9618bcdeabc4a023b98740"
+dependencies = [
+ "futures",
+ "pin-project",
+ "tokio",
+]
+
+[[package]]
+name = "async-stream"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476"
+dependencies = [
+ "async-stream-impl",
+ "futures-core",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "async-stream-impl"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "atomic"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340"
+dependencies = [
+ "bytemuck",
+]
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "axum"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+dependencies = [
+ "async-trait",
+ "axum-core",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "sync_wrapper",
+ "tower 0.5.3",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "backtrace"
+version = "0.3.76"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "serde",
+ "windows-link",
+]
+
+[[package]]
+name = "base16ct"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
+
+[[package]]
+name = "base64"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "base64ct"
+version = "1.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
+
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
+dependencies = [
+ "bitflags",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash 1.1.0",
+ "shlex",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "bitflags"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
+
+[[package]]
+name = "bitvec"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
+[[package]]
+name = "blake2"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "blake2b_simd"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b79834656f71332577234b50bfc009996f7449e0c056884e6a02492ded0ca2f3"
+dependencies = [
+ "arrayref",
+ "arrayvec",
+ "constant_time_eq",
+]
+
+[[package]]
+name = "blake3"
+version = "1.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
+dependencies = [
+ "arrayref",
+ "arrayvec",
+ "cc",
+ "cfg-if",
+ "constant_time_eq",
+ "cpufeatures",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array 0.14.9",
+]
+
+[[package]]
+name = "bls12_381"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3c196a77437e7cc2fb515ce413a6401291578b5afc8ecb29a3c7ab957f05941"
+dependencies = [
+ "ff 0.12.1",
+ "group 0.12.1",
+ "pairing",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
+[[package]]
+name = "byte-slice-cast"
+version = "1.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7575182f7272186991736b70173b0ea045398f984bf5ebbb3804736ce1330c9d"
+
+[[package]]
+name = "bytemuck"
+version = "1.25.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
+dependencies = [
+ "bytemuck_derive",
+]
+
+[[package]]
+name = "bytemuck_derive"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "bytes"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+
+[[package]]
+name = "camino"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "cargo-platform"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "cargo_metadata"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037"
+dependencies = [
+ "camino",
+ "cargo-platform",
+ "semver",
+ "serde",
+ "serde_json",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "cc"
+version = "1.2.56"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
+dependencies = [
+ "find-msvc-tools",
+ "shlex",
+]
+
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
+[[package]]
+name = "chrono"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+dependencies = [
+ "iana-time-zone",
+ "num-traits",
+ "windows-link",
+]
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.55"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "clap_lex"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
+
+[[package]]
+name = "console"
+version = "0.15.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "once_cell",
+ "unicode-width 0.2.2",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "const-default"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b396d1f76d455557e1218ec8066ae14bba60b4b36ecd55577ba979f5db7ecaa"
+
+[[package]]
+name = "const-oid"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+
+[[package]]
+name = "const_format"
+version = "0.2.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7faa7469a93a566e9ccc1c73fe783b4a65c274c5ace346038dca9c39fe0030ad"
+dependencies = [
+ "const_format_proc_macros",
+]
+
+[[package]]
+name = "const_format_proc_macros"
+version = "0.2.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-xid",
+]
+
+[[package]]
+name = "constant_time_eq"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "critical-section"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
+
+[[package]]
+name = "crossbeam"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8"
+dependencies = [
+ "crossbeam-channel",
+ "crossbeam-deque",
+ "crossbeam-epoch",
+ "crossbeam-queue",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-channel"
+version = "0.5.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "crypto-bigint"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
+dependencies = [
+ "generic-array 0.14.9",
+ "rand_core 0.6.4",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "crypto-common"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
+dependencies = [
+ "generic-array 0.14.9",
+ "typenum",
+]
+
+[[package]]
+name = "dashu"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85b3e5ac1e23ff1995ef05b912e2b012a8784506987a2651552db2c73fb3d7e0"
+dependencies = [
+ "dashu-base",
+ "dashu-float",
+ "dashu-int",
+ "dashu-macros",
+ "dashu-ratio",
+ "rustversion",
+]
+
+[[package]]
+name = "dashu-base"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0b80bf6b85aa68c58ffea2ddb040109943049ce3fbdf4385d0380aef08ef289"
+
+[[package]]
+name = "dashu-float"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85078445a8dbd2e1bd21f04a816f352db8d333643f0c9b78ca7c3d1df71063e7"
+dependencies = [
+ "dashu-base",
+ "dashu-int",
+ "num-modular",
+ "num-order",
+ "rustversion",
+ "static_assertions",
+]
+
+[[package]]
+name = "dashu-int"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee99d08031ca34a4d044efbbb21dff9b8c54bb9d8c82a189187c0651ffdb9fbf"
+dependencies = [
+ "cfg-if",
+ "dashu-base",
+ "num-modular",
+ "num-order",
+ "rustversion",
+ "static_assertions",
+]
+
+[[package]]
+name = "dashu-macros"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93381c3ef6366766f6e9ed9cf09e4ef9dec69499baf04f0c60e70d653cf0ab10"
+dependencies = [
+ "dashu-base",
+ "dashu-float",
+ "dashu-int",
+ "dashu-ratio",
+ "paste",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+]
+
+[[package]]
+name = "dashu-ratio"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e33b04dd7ce1ccf8a02a69d3419e354f2bbfdf4eb911a0b7465487248764c9"
+dependencies = [
+ "dashu-base",
+ "dashu-float",
+ "dashu-int",
+ "num-modular",
+ "num-order",
+ "rustversion",
+]
+
+[[package]]
+name = "deepsize2"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86b5184084af9beed35eecbf4c36baf6e26b9dc47b61b74e02f930c72a58e71b"
+dependencies = [
+ "deepsize_derive2",
+ "hashbrown 0.14.5",
+]
+
+[[package]]
+name = "deepsize_derive2"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e0f8817865cacf3b93b943ca06b0fc5fd8e99eabfdb7ea5d296efcbc4afc4f69"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "der"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
+dependencies = [
+ "const-oid",
+ "pem-rfc7468",
+ "zeroize",
+]
+
+[[package]]
+name = "deranged"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
+dependencies = [
+ "powerfmt",
+]
+
+[[package]]
+name = "derivative"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "derive-where"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef941ded77d15ca19b40374869ac6000af1c9f2a4c0f3d4c70926287e6364a8f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "derive_more"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "const-oid",
+ "crypto-common",
+ "subtle",
+]
+
+[[package]]
+name = "dirs"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
+dependencies = [
+ "dirs-sys",
+]
+
+[[package]]
+name = "dirs-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "downcast-rs"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
+
+[[package]]
+name = "downloader"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ac1e888d6830712d565b2f3a974be3200be9296bc1b03db8251a4cbf18a4a34"
+dependencies = [
+ "digest",
+ "futures",
+ "rand 0.8.5",
+ "reqwest",
+ "thiserror 1.0.69",
+ "tokio",
+]
+
+[[package]]
+name = "dynasm"
+version = "3.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f7d4c414c94bc830797115b8e5f434d58e7e80cb42ba88508c14bc6ea270625"
+dependencies = [
+ "bitflags",
+ "byteorder",
+ "lazy_static",
+ "proc-macro-error2",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "dynasmrt"
+version = "3.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "602f7458a3859195fb840e6e0cce5f4330dd9dfbfece0edaf31fe427af346f55"
+dependencies = [
+ "byteorder",
+ "dynasm",
+ "fnv",
+ "memmap2",
+]
+
+[[package]]
+name = "ecdsa"
+version = "0.16.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
+dependencies = [
+ "der",
+ "digest",
+ "elliptic-curve",
+ "rfc6979",
+ "serdect",
+ "signature",
+ "spki",
+]
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "elf"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4445909572dbd556c457c849c4ca58623d84b27c8fff1e74b0b4227d8b90d17b"
+
+[[package]]
+name = "elliptic-curve"
+version = "0.13.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
+dependencies = [
+ "base16ct",
+ "crypto-bigint",
+ "digest",
+ "ff 0.13.1",
+ "generic-array 0.14.9",
+ "group 0.13.0",
+ "pem-rfc7468",
+ "pkcs8",
+ "rand_core 0.6.4",
+ "sec1",
+ "serdect",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "embedded-alloc"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f2de9133f68db0d4627ad69db767726c99ff8585272716708227008d3f1bddd"
+dependencies = [
+ "const-default",
+ "critical-section",
+ "linked_list_allocator",
+ "rlsf",
+]
+
+[[package]]
+name = "encode_unicode"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+
+[[package]]
+name = "enum-map"
+version = "2.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9"
+dependencies = [
+ "enum-map-derive",
+ "serde",
+]
+
+[[package]]
+name = "enum-map-derive"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "eventsource-stream"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab"
+dependencies = [
+ "futures-core",
+ "nom",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "eyre"
+version = "0.6.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd915d99f24784cdc19fd37ef22b97e3ff0ae756c7e492e9fbfe897d61e2aec"
+dependencies = [
+ "indenter",
+ "once_cell",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+
+[[package]]
+name = "ff"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160"
+dependencies = [
+ "bitvec",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "ff"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393"
+dependencies = [
+ "bitvec",
+ "byteorder",
+ "ff_derive",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "ff_derive"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f10d12652036b0e99197587c6ba87a8fc3031986499973c030d8b44fcc151b60"
+dependencies = [
+ "addchain",
+ "num-bigint 0.3.3",
+ "num-integer",
+ "num-traits",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "fibonacci-program"
+version = "0.1.0"
+dependencies = [
+ "sp1-zkvm",
+]
+
+[[package]]
+name = "fibonacci-script"
+version = "0.1.0"
+dependencies = [
+ "sp1-build",
+ "sp1-sdk",
+]
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "funty"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
+
+[[package]]
+name = "futures"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "gcd"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d758ba1b47b00caf47f24925c0074ecb20d6dfcffe7f6d53395c0465674841a"
+
+[[package]]
+name = "gen_ops"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
+
+[[package]]
+name = "generic-array"
+version = "0.14.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2"
+dependencies = [
+ "typenum",
+ "version_check",
+ "zeroize",
+]
+
+[[package]]
+name = "generic-array"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96512db27971c2c3eece70a1e106fbe6c87760234e31e8f7e5634912fe52794a"
+dependencies = [
+ "serde",
+ "typenum",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "r-efi",
+ "wasip2",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "gimli"
+version = "0.32.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7"
+
+[[package]]
+name = "glob"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+
+[[package]]
+name = "group"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
+dependencies = [
+ "ff 0.12.1",
+ "memuse",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "group"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
+dependencies = [
+ "ff 0.13.1",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "h2"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap 2.13.0",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "halo2"
+version = "0.1.0-beta.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a23c779b38253fe1538102da44ad5bd5378495a61d2c4ee18d64eaa61ae5995"
+dependencies = [
+ "halo2_proofs",
+]
+
+[[package]]
+name = "halo2_proofs"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e925780549adee8364c7f2b685c753f6f3df23bde520c67416e93bf615933760"
+dependencies = [
+ "blake2b_simd",
+ "ff 0.12.1",
+ "group 0.12.1",
+ "pasta_curves 0.4.1",
+ "rand_core 0.6.4",
+ "rayon",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+ "allocator-api2",
+ "serde",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "hex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
+
+[[package]]
+name = "hmac"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "http"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
+[[package]]
+name = "hyper"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "pin-utils",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+ "webpki-roots",
+]
+
+[[package]]
+name = "hyper-timeout"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
+dependencies = [
+ "hyper",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2 0.6.2",
+ "tokio",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.65"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "log",
+ "wasm-bindgen",
+ "windows-core 0.62.2",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "icu_collections"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
+
+[[package]]
+name = "icu_properties"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
+
+[[package]]
+name = "icu_provider"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
+[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+
+[[package]]
+name = "impl-trait-for-tuples"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "indenter"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5"
+
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.16.1",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "indicatif"
+version = "0.17.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
+dependencies = [
+ "console",
+ "number_prefix",
+ "portable-atomic",
+ "unicode-width 0.2.2",
+ "web-time",
+]
+
+[[package]]
+name = "ipnet"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
+
+[[package]]
+name = "iri-string"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itertools"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
+
+[[package]]
+name = "js-sys"
+version = "0.3.90"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "jubjub"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a575df5f985fe1cd5b2b05664ff6accfc46559032b954529fd225a2168d27b0f"
+dependencies = [
+ "bitvec",
+ "bls12_381",
+ "ff 0.12.1",
+ "group 0.12.1",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "k256"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b"
+dependencies = [
+ "cfg-if",
+ "ecdsa",
+ "elliptic-curve",
+ "once_cell",
+ "serdect",
+ "sha2",
+ "signature",
+]
+
+[[package]]
+name = "keccak"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653"
+dependencies = [
+ "cpufeatures",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+dependencies = [
+ "spin",
+]
+
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
+[[package]]
+name = "libc"
+version = "0.2.182"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"
+
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+
+[[package]]
+name = "libm"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+
+[[package]]
+name = "libredox"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+dependencies = [
+ "bitflags",
+ "libc",
+]
+
+[[package]]
+name = "linked_list_allocator"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afa463f5405ee81cdb9cc2baf37e08ec7e4c8209442b5d72c04cfb2cd6e6286"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
+[[package]]
+name = "litemap"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
+
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+
+[[package]]
+name = "lru"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
+dependencies = [
+ "hashbrown 0.15.5",
+]
+
+[[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
+[[package]]
+name = "matchers"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
+dependencies = [
+ "regex-automata",
+]
+
+[[package]]
+name = "matchit"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
+[[package]]
+name = "md-5"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
+dependencies = [
+ "cfg-if",
+ "digest",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+
+[[package]]
+name = "memfd"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad38eb12aea514a0466ea40a80fd8cc83637065948eb4a426e4aa46261175227"
+dependencies = [
+ "rustix",
+]
+
+[[package]]
+name = "memmap2"
+version = "0.9.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "memuse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d97bbf43eb4f088f8ca469930cde17fa036207c9a5e02ccc5107c4e8b17c964"
+
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+]
+
+[[package]]
+name = "mio"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "mti"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9563a7d5556636e74bbd8773241fbcbc5c89b9f6bfdc97b29b56e740c2c74b9"
+dependencies = [
+ "typeid_prefix",
+ "typeid_suffix",
+]
+
+[[package]]
+name = "multimap"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "ntapi"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "nu-ansi-term"
+version = "0.50.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint 0.4.6",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f6f7833f2cbf2360a6cfd58cd41a53aa7a90bd4c202f5b1c7dd2ed73c57b2c3"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050"
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-modular"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17bb261bf36fa7d83f4c294f834e91256769097b3cb505d44831e0a179ac647f"
+
+[[package]]
+name = "num-order"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "537b596b97c40fcf8056d153049eb22f481c17ebce72a513ec9286e4986d1bb6"
+dependencies = [
+ "num-modular",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint 0.4.6",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "num_cpus"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+
+[[package]]
+name = "num_enum"
+version = "0.5.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9"
+dependencies = [
+ "num_enum_derive",
+]
+
+[[package]]
+name = "num_enum_derive"
+version = "0.5.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799"
+dependencies = [
+ "proc-macro-crate 1.3.1",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "number_prefix"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
+
+[[package]]
+name = "object"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
+[[package]]
+name = "opentelemetry"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+ "js-sys",
+ "once_cell",
+ "pin-project-lite",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "option-ext"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
+[[package]]
+name = "p256"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
+dependencies = [
+ "ecdsa",
+ "elliptic-curve",
+ "primeorder",
+ "sha2",
+]
+
+[[package]]
+name = "p3-air"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d275c27bb81483d669709d7244ce333b51f9743af2474cdc09ba1509f5c290db"
+dependencies = [
+ "p3-field",
+ "p3-matrix",
+ "serde",
+]
+
+[[package]]
+name = "p3-baby-bear"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95a083928c9055f2171e3cb0bb4767969e4955473e71ba61affe46d7a3c98a89"
+dependencies = [
+ "num-bigint 0.4.6",
+ "p3-field",
+ "p3-mds",
+ "p3-poseidon2",
+ "p3-symmetric",
+ "rand 0.8.5",
+ "serde",
+]
+
+[[package]]
+name = "p3-bn254-fr"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9abf208fbfe540d6e2a6caaa2a9a345b1c8cb23ffdcdfcc6987244525d4fc821"
+dependencies = [
+ "ff 0.13.1",
+ "num-bigint 0.4.6",
+ "p3-field",
+ "p3-poseidon2",
+ "p3-symmetric",
+ "rand 0.8.5",
+ "serde",
+]
+
+[[package]]
+name = "p3-challenger"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42b725b453bbb35117a1abf0ddfd900b0676063d6e4231e0fa6bb0d76018d8ad"
+dependencies = [
+ "p3-field",
+ "p3-maybe-rayon",
+ "p3-symmetric",
+ "p3-util",
+ "serde",
+ "tracing",
+]
+
+[[package]]
+name = "p3-commit"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "518695b56f450f9223bdd8994dda87916b97ebf1d1c03c956807e78522fdb333"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-challenger",
+ "p3-field",
+ "p3-matrix",
+ "p3-util",
+ "serde",
+]
+
+[[package]]
+name = "p3-dft"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56a1f81101bff744b7ebba7f4497e917a2c6716d6e62736e4a56e555a2d98cb7"
+dependencies = [
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+ "tracing",
+]
+
+[[package]]
+name = "p3-field"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36459d4acb03d08097d713f336c7393990bb489ab19920d4f68658c7a5c10968"
+dependencies = [
+ "itertools 0.12.1",
+ "num-bigint 0.4.6",
+ "num-traits",
+ "p3-util",
+ "rand 0.8.5",
+ "serde",
+]
+
+[[package]]
+name = "p3-fri"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e2529a174a04189cfe705d756fb0e33d3c8fb06b167b521ddb877c78407f12a"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-challenger",
+ "p3-commit",
+ "p3-dft",
+ "p3-field",
+ "p3-interpolation",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+ "serde",
+ "tracing",
+]
+
+[[package]]
+name = "p3-interpolation"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6662049877c802155cdb4863db59899469fc3565d22d9047e1bd22d6b71f28e5"
+dependencies = [
+ "p3-field",
+ "p3-matrix",
+ "p3-util",
+]
+
+[[package]]
+name = "p3-keccak-air"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "169c96f8f0aaa9042872fdb6bbae0477fd1363b87c23877dbb2ec7fb46f8fcfa"
+dependencies = [
+ "p3-air",
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+ "tracing",
+]
+
+[[package]]
+name = "p3-koala-bear"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb1f52bcb6be38bdc8fa6b38b3434d4eedd511f361d4249fd798c6a5ef817b40"
+dependencies = [
+ "num-bigint 0.4.6",
+ "p3-field",
+ "p3-mds",
+ "p3-poseidon2",
+ "p3-symmetric",
+ "rand 0.8.5",
+ "serde",
+]
+
+[[package]]
+name = "p3-matrix"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5583e9cd136a4095a25c41a9edfdcce2dfae58ef01639317813bdbbd5b55c583"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-field",
+ "p3-maybe-rayon",
+ "p3-util",
+ "rand 0.8.5",
+ "serde",
+ "tracing",
+]
+
+[[package]]
+name = "p3-maybe-rayon"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e524d47a49fb4265611303339c4ef970d892817b006cc330dad18afb91e411b1"
+dependencies = [
+ "rayon",
+]
+
+[[package]]
+name = "p3-mds"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f6cb8edcb276033d43769a3725570c340d2ed6f35c3cca4cddeee07718fa376"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-dft",
+ "p3-field",
+ "p3-matrix",
+ "p3-symmetric",
+ "p3-util",
+ "rand 0.8.5",
+]
+
+[[package]]
+name = "p3-merkle-tree"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e8bc3c224fc70d22f9556393e1482b52539e11c7b82ac6933c436fd82738f4"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-commit",
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-symmetric",
+ "p3-util",
+ "serde",
+ "tracing",
+]
+
+[[package]]
+name = "p3-poseidon2"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a26197df2097b98ab7038d59a01e1fe1a0f545e7e04aa9436b2454b1836654f"
+dependencies = [
+ "gcd",
+ "p3-field",
+ "p3-mds",
+ "p3-symmetric",
+ "rand 0.8.5",
+ "serde",
+]
+
+[[package]]
+name = "p3-symmetric"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a1d3b5202096bca57cde912fbbb9cbaedaf5ac7c42a924c7166b98709d64d21"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-field",
+ "serde",
+]
+
+[[package]]
+name = "p3-uni-stark"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fef1cdb8285a7adb78df991852d3b66d3b25cf6ffc34f528505d1aee49bdb968"
+dependencies = [
+ "itertools 0.12.1",
+ "p3-air",
+ "p3-challenger",
+ "p3-commit",
+ "p3-dft",
+ "p3-field",
+ "p3-matrix",
+ "p3-maybe-rayon",
+ "p3-util",
+ "serde",
+ "tracing",
+]
+
+[[package]]
+name = "p3-util"
+version = "0.3.2-succinct"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec5f0388aa6d935ca3a17444086120f393f0b2f0816010b5ff95998c1c4095e3"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "pairing"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135590d8bdba2b31346f9cd1fb2a912329f5135e832a4f422942eb6ead8b6b3b"
+dependencies = [
+ "group 0.12.1",
+]
+
+[[package]]
+name = "parity-scale-codec"
+version = "3.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "799781ae679d79a948e13d4824a40970bfa500058d245760dd857301059810fa"
+dependencies = [
+ "arrayvec",
+ "byte-slice-cast",
+ "const_format",
+ "impl-trait-for-tuples",
+ "parity-scale-codec-derive",
+ "rustversion",
+]
+
+[[package]]
+name = "parity-scale-codec-derive"
+version = "3.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34b4653168b563151153c9e4c08ebed57fb8262bebfa79711552fa983c623e7a"
+dependencies = [
+ "proc-macro-crate 3.4.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "parking_lot"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-link",
+]
+
+[[package]]
+name = "pasta_curves"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cc65faf8e7313b4b1fbaa9f7ca917a0eed499a9663be71477f87993604341d8"
+dependencies = [
+ "blake2b_simd",
+ "ff 0.12.1",
+ "group 0.12.1",
+ "lazy_static",
+ "rand 0.8.5",
+ "static_assertions",
+ "subtle",
+]
+
+[[package]]
+name = "pasta_curves"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3e57598f73cc7e1b2ac63c79c517b31a0877cd7c402cdcaa311b5208de7a095"
+dependencies = [
+ "blake2b_simd",
+ "ff 0.13.1",
+ "group 0.13.0",
+ "lazy_static",
+ "rand 0.8.5",
+ "static_assertions",
+ "subtle",
+]
+
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
+[[package]]
+name = "pem-rfc7468"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
+dependencies = [
+ "base64ct",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
+
+[[package]]
+name = "petgraph"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+dependencies = [
+ "fixedbitset",
+ "indexmap 2.13.0",
+]
+
+[[package]]
+name = "pin-project"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
+[[package]]
+name = "pkcs8"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
+dependencies = [
+ "der",
+ "spki",
+]
+
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+
+[[package]]
+name = "potential_utf"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
+dependencies = [
+ "zerovec",
+]
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "primeorder"
+version = "0.13.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
+dependencies = [
+ "elliptic-curve",
+]
+
+[[package]]
+name = "proc-macro-crate"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919"
+dependencies = [
+ "once_cell",
+ "toml_edit 0.19.15",
+]
+
+[[package]]
+name = "proc-macro-crate"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983"
+dependencies = [
+ "toml_edit 0.23.10+spec-1.0.0",
+]
+
+[[package]]
+name = "proc-macro-error-attr2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "proc-macro-error2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802"
+dependencies = [
+ "proc-macro-error-attr2",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "prost"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
+dependencies = [
+ "bytes",
+ "prost-derive",
+]
+
+[[package]]
+name = "prost-build"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
+dependencies = [
+ "heck",
+ "itertools 0.14.0",
+ "log",
+ "multimap",
+ "once_cell",
+ "petgraph",
+ "prettyplease",
+ "prost",
+ "prost-types",
+ "regex",
+ "syn 2.0.117",
+ "tempfile",
+]
+
+[[package]]
+name = "prost-derive"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
+dependencies = [
+ "anyhow",
+ "itertools 0.14.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "prost-types"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
+dependencies = [
+ "prost",
+]
+
+[[package]]
+name = "quinn"
+version = "0.11.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash 2.1.1",
+ "rustls",
+ "socket2 0.6.2",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
+dependencies = [
+ "bytes",
+ "getrandom 0.3.4",
+ "lru-slab",
+ "rand 0.9.2",
+ "ring",
+ "rustc-hash 2.1.1",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.18",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2 0.6.2",
+ "tracing",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "radium"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.17",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
+name = "range-set-blaze"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
+dependencies = [
+ "gen_ops",
+ "itertools 0.12.1",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "rayon"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "rayon-scan"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f87cc11a0140b4b0da0ffc889885760c61b13672d80a908920b2c0df078fa14"
+dependencies = [
+ "rayon",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "redox_users"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
+dependencies = [
+ "getrandom 0.2.17",
+ "libredox",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "reqwest"
+version = "0.12.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "percent-encoding",
+ "pin-project-lite",
+ "quinn",
+ "rustls",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tokio-rustls",
+ "tokio-util",
+ "tower 0.5.3",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "wasm-streams",
+ "web-sys",
+ "webpki-roots",
+]
+
+[[package]]
+name = "rfc6979"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
+dependencies = [
+ "hmac",
+ "subtle",
+]
+
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.17",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rlsf"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1646a59a9734b8b7a0ac51689388a60fe1625d4b956348e9de07591a1478457a"
+dependencies = [
+ "cfg-if",
+ "const-default",
+ "libc",
+ "rustversion",
+ "svgbobdoc",
+]
+
+[[package]]
+name = "rrs-succinct"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "efd079cd303257a4cb4e5aadfa79a7fe23f3c8301aa4740ccc3a99673485a352"
+dependencies = [
+ "downcast-rs",
+ "num_enum",
+ "paste",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
+[[package]]
+name = "rustc-hex"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e75f6a532d0fd9f7f13144f392b6ad56a32696bfcd9c78f797f16bbb6f072d6"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls"
+version = "0.23.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
+dependencies = [
+ "log",
+ "once_cell",
+ "ring",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-pemfile"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
+dependencies = [
+ "web-time",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-webpki"
+version = "0.103.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "ryu"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+
+[[package]]
+name = "scale-info"
+version = "2.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346a3b32eba2640d17a9cb5927056b08f3de90f65b72fe09402c2ad07d684d0b"
+dependencies = [
+ "cfg-if",
+ "derive_more",
+ "parity-scale-codec",
+ "scale-info-derive",
+]
+
+[[package]]
+name = "scale-info-derive"
+version = "2.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6630024bf739e2179b91fb424b28898baf819414262c5d376677dbff1fe7ebf"
+dependencies = [
+ "proc-macro-crate 3.4.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "scc"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc"
+dependencies = [
+ "sdd",
+]
+
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "sdd"
+version = "3.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca"
+
+[[package]]
+name = "sec1"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
+dependencies = [
+ "base16ct",
+ "der",
+ "generic-array 0.14.9",
+ "pkcs8",
+ "serdect",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
+dependencies = [
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_arrays"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94a16b99c5ea4fe3daccd14853ad260ec00ea043b2708d1fd1da3106dcd8d9df"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "serde_urlencoded"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
+dependencies = [
+ "form_urlencoded",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "serdect"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a84f14a19e9a014bb9f4512488d9829a68e04ecabffb0f9904cd1ace94598177"
+dependencies = [
+ "base16ct",
+ "serde",
+]
+
+[[package]]
+name = "serial_test"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f"
+dependencies = [
+ "futures-executor",
+ "futures-util",
+ "log",
+ "once_cell",
+ "parking_lot",
+ "scc",
+ "serial_test_derive",
+]
+
+[[package]]
+name = "serial_test_derive"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "sha1_smol"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d"
+
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
+[[package]]
+name = "sha3"
+version = "0.10.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60"
+dependencies = [
+ "digest",
+ "keccak",
+]
+
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
+dependencies = [
+ "errno",
+ "libc",
+]
+
+[[package]]
+name = "signature"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
+dependencies = [
+ "digest",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "slop-air"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c27279ff5aa6177ad08fd2bcde31f34fc98ea633666a835a4fad3502824dce26"
+dependencies = [
+ "p3-air",
+]
+
+[[package]]
+name = "slop-algebra"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1d38320f4622a9f07907b8529d031066a75a6e741ea2ef17ed1e16047f5bd77"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-field",
+ "serde",
+]
+
+[[package]]
+name = "slop-alloc"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51cdc27df6c9fe163f68b724d2b00b4edd24e66bf38a06e7bc473e50e36c3799"
+dependencies = [
+ "serde",
+ "slop-algebra",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "slop-baby-bear"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3500e0ad37b85d0dfd792c59615abe9741fe519c78bdc3928dc3fbbab57c2b5b"
+dependencies = [
+ "lazy_static",
+ "p3-baby-bear",
+ "serde",
+ "slop-algebra",
+ "slop-challenger",
+ "slop-poseidon2",
+ "slop-symmetric",
+]
+
+[[package]]
+name = "slop-basefold"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbc3d75bc5651b46f135ac04140fefa4a9a4143440edcccc1e9d8e4d3dd05715"
+dependencies = [
+ "derive-where",
+ "itertools 0.14.0",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-baby-bear",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-koala-bear",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-primitives",
+ "slop-tensor",
+ "slop-utils",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "slop-basefold-prover"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17834564e6d40554b7a635db4fb8cfd61a19f7cc3438549d53b40a4e9e157b1f"
+dependencies = [
+ "derive-where",
+ "itertools 0.14.0",
+ "rand 0.8.5",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-baby-bear",
+ "slop-basefold",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-commit",
+ "slop-dft",
+ "slop-fri",
+ "slop-futures",
+ "slop-koala-bear",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-tensor",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "slop-bn254"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91cb09414adf73264281cf490e2bd23be7d28415e4e729a275029ebc1a0acf6a"
+dependencies = [
+ "ff 0.13.1",
+ "p3-bn254-fr",
+ "serde",
+ "slop-algebra",
+ "slop-challenger",
+ "slop-poseidon2",
+ "slop-symmetric",
+ "zkhash",
+]
+
+[[package]]
+name = "slop-challenger"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "395ae2cad21ea894c614166f48dce58135be2aa13ab04971cbe6e31b85ad9902"
+dependencies = [
+ "futures",
+ "p3-challenger",
+ "serde",
+ "slop-algebra",
+ "slop-symmetric",
+]
+
+[[package]]
+name = "slop-commit"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b0ed38f216999ad9211f384f59d20ff0f70b88010a2856b7e0dde4d23b8cde8"
+dependencies = [
+ "p3-commit",
+ "serde",
+ "slop-alloc",
+]
+
+[[package]]
+name = "slop-dft"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9211d3c0ff3794563ffc7c3ffa3a5cde8becee5f6e831fb94552a607e320fd23"
+dependencies = [
+ "p3-dft",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-matrix",
+ "slop-tensor",
+]
+
+[[package]]
+name = "slop-fri"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c90e0689aa9b4f67700d6a100fd02e6e0f17de1eb806bb78e2074462b7b6201"
+dependencies = [
+ "p3-fri",
+]
+
+[[package]]
+name = "slop-futures"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e68c32cc3be82a37b69af3d1e4effb509839d9c2fab7457c41c3d50dd32a842e"
+dependencies = [
+ "crossbeam",
+ "futures",
+ "pin-project",
+ "rayon",
+ "thiserror 1.0.69",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "slop-jagged"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bce3113032254921bef5e071216f35519ea08730a1f44f2ade4be7e0c305631a"
+dependencies = [
+ "derive-where",
+ "futures",
+ "itertools 0.14.0",
+ "num_cpus",
+ "rand 0.8.5",
+ "rayon",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-baby-bear",
+ "slop-basefold",
+ "slop-basefold-prover",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-commit",
+ "slop-futures",
+ "slop-koala-bear",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-stacked",
+ "slop-sumcheck",
+ "slop-symmetric",
+ "slop-tensor",
+ "slop-utils",
+ "thiserror 1.0.69",
+ "tracing",
+]
+
+[[package]]
+name = "slop-keccak-air"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27bef24890c8a39c8caf484afa97060c41466455f9102283902ff68bbfd7f841"
+dependencies = [
+ "p3-keccak-air",
+]
+
+[[package]]
+name = "slop-koala-bear"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb1f80eb2a075f550c7e9abed16e03c727f54108f587a465d023ec810100a70f"
+dependencies = [
+ "lazy_static",
+ "p3-koala-bear",
+ "serde",
+ "slop-algebra",
+ "slop-challenger",
+ "slop-poseidon2",
+ "slop-symmetric",
+]
+
+[[package]]
+name = "slop-matrix"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba089c19d768cc452b511f754958254892caed33d7a8d744ffc67377111e4908"
+dependencies = [
+ "p3-matrix",
+]
+
+[[package]]
+name = "slop-maybe-rayon"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e968db301ffe72ca69fe7a8b61e0f5f8d3b22a12475c5c9b99141e60ad8956d"
+dependencies = [
+ "p3-maybe-rayon",
+]
+
+[[package]]
+name = "slop-merkle-tree"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51f2721f11f0242bcc36e3cdc70cf31fdbb936b2731f1d059929b436fe002fa8"
+dependencies = [
+ "derive-where",
+ "ff 0.13.1",
+ "itertools 0.14.0",
+ "p3-merkle-tree",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-baby-bear",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-commit",
+ "slop-futures",
+ "slop-koala-bear",
+ "slop-matrix",
+ "slop-poseidon2",
+ "slop-symmetric",
+ "slop-tensor",
+ "thiserror 1.0.69",
+ "zkhash",
+]
+
+[[package]]
+name = "slop-multilinear"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf5d26d5fc7751af8de644225f51c178e2af42e2b762496ba9a00fc65677617d"
+dependencies = [
+ "derive-where",
+ "futures",
+ "num_cpus",
+ "rand 0.8.5",
+ "rayon",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-challenger",
+ "slop-commit",
+ "slop-futures",
+ "slop-matrix",
+ "slop-tensor",
+]
+
+[[package]]
+name = "slop-poseidon2"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f26080f555f777867a68eb18fa34d7c321e9f0250ace86ef3f0cb0151157133"
+dependencies = [
+ "p3-poseidon2",
+]
+
+[[package]]
+name = "slop-primitives"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a606113e4aac9024483e283ab6ef7afc4ebd5d5ca0915b713f8d1d23aa1687bd"
+dependencies = [
+ "slop-algebra",
+]
+
+[[package]]
+name = "slop-stacked"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9f807203f2d5505ab4b6f44a99d575aaf0d46a39b16af42397b310063667ee8"
+dependencies = [
+ "derive-where",
+ "futures",
+ "itertools 0.14.0",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-basefold",
+ "slop-basefold-prover",
+ "slop-challenger",
+ "slop-commit",
+ "slop-futures",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-tensor",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "slop-sumcheck"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4045fc34ee3aef98a67baf650bc462327bbc95ca69e0265ba56f9b6cfc2515b"
+dependencies = [
+ "futures",
+ "itertools 0.14.0",
+ "rayon",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-baby-bear",
+ "slop-challenger",
+ "slop-multilinear",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "slop-symmetric"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1eb38a05aacd00d2362bb5f51c00f3e9cb82b7091d7b862ac239171d5a3dcad4"
+dependencies = [
+ "p3-symmetric",
+]
+
+[[package]]
+name = "slop-tensor"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ba4b24bc6985c0215c459e723228c0da10a7b35541c8ccb1b533146d49df49f"
+dependencies = [
+ "arrayvec",
+ "derive-where",
+ "itertools 0.14.0",
+ "rand 0.8.5",
+ "rayon",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-futures",
+ "slop-matrix",
+ "thiserror 1.0.69",
+ "transpose",
+]
+
+[[package]]
+name = "slop-uni-stark"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f7e27e2c06b9504dbb5eb3cbee929b651f9da6ea5112dd4004ce1cc3b8e586"
+dependencies = [
+ "p3-uni-stark",
+]
+
+[[package]]
+name = "slop-utils"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97b2e9bd1717e7848d44ce8f5d3eb92209c658a0e934b02af7a5dad4f70271a6"
+dependencies = [
+ "p3-util",
+ "tracing-forest",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "slop-whir"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1393446116ca30b7685a5ca9bb50bb9095b8074bdd63941a8d64b3c22cc14a8"
+dependencies = [
+ "derive-where",
+ "futures",
+ "itertools 0.14.0",
+ "rand 0.8.5",
+ "rayon",
+ "serde",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-baby-bear",
+ "slop-basefold",
+ "slop-challenger",
+ "slop-commit",
+ "slop-dft",
+ "slop-jagged",
+ "slop-koala-bear",
+ "slop-matrix",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-stacked",
+ "slop-tensor",
+ "slop-utils",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "snowbridge-amcl"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460a9ed63cdf03c1b9847e8a12a5f5ba19c4efd5869e4a737e05be25d7c427e5"
+dependencies = [
+ "parity-scale-codec",
+ "scale-info",
+]
+
+[[package]]
+name = "socket2"
+version = "0.5.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "socket2"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0"
+dependencies = [
+ "libc",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
+name = "sp1-build"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c469c584f9a1f0f7a64283c94c074d1edb0446a2ff76a7f60f7e4ce804f4b2c"
+dependencies = [
+ "anyhow",
+ "cargo_metadata",
+ "chrono",
+ "clap",
+ "dirs",
+ "sp1-primitives",
+]
+
+[[package]]
+name = "sp1-core-executor"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c3d6a58470da2280a8bf14457721b1f560e4d0b8e67c1067e1ce78fdcc5fde3"
+dependencies = [
+ "bincode",
+ "bytemuck",
+ "cfg-if",
+ "clap",
+ "deepsize2",
+ "elf",
+ "enum-map",
+ "eyre",
+ "hashbrown 0.14.5",
+ "hex",
+ "itertools 0.14.0",
+ "memmap2",
+ "num",
+ "rrs-succinct",
+ "serde",
+ "serde_arrays",
+ "serde_json",
+ "slop-air",
+ "slop-algebra",
+ "slop-maybe-rayon",
+ "slop-symmetric",
+ "sp1-curves",
+ "sp1-hypercube",
+ "sp1-jit",
+ "sp1-primitives",
+ "strum",
+ "subenum",
+ "thiserror 1.0.69",
+ "tiny-keccak",
+ "tracing",
+ "typenum",
+ "vec_map",
+]
+
+[[package]]
+name = "sp1-core-machine"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7704a5542a77a0b98e483bd87256362658a91b7b70a6864c5c2c92fbbc7a5a71"
+dependencies = [
+ "bincode",
+ "cfg-if",
+ "enum-map",
+ "futures",
+ "generic-array 1.1.0",
+ "hashbrown 0.14.5",
+ "itertools 0.14.0",
+ "num",
+ "num_cpus",
+ "rayon",
+ "rayon-scan",
+ "rrs-succinct",
+ "serde",
+ "serde_json",
+ "slop-air",
+ "slop-algebra",
+ "slop-challenger",
+ "slop-futures",
+ "slop-keccak-air",
+ "slop-matrix",
+ "slop-maybe-rayon",
+ "slop-uni-stark",
+ "snowbridge-amcl",
+ "sp1-core-executor",
+ "sp1-curves",
+ "sp1-derive",
+ "sp1-hypercube",
+ "sp1-jit",
+ "sp1-primitives",
+ "static_assertions",
+ "strum",
+ "sysinfo",
+ "tempfile",
+ "thiserror 1.0.69",
+ "tokio",
+ "tracing",
+ "tracing-forest",
+ "tracing-subscriber",
+ "typenum",
+]
+
+[[package]]
+name = "sp1-cuda"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03e57b85361e6fcc7d5405867eb036c10969518fb8173e3d6744574f97954766"
+dependencies = [
+ "bincode",
+ "bytes",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "sp1-core-executor",
+ "sp1-core-machine",
+ "sp1-primitives",
+ "sp1-prover",
+ "sp1-prover-types",
+ "thiserror 1.0.69",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-curves"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8eabe28a711559675f1addb4a529159c5db383a79f62819dffd4349ccb4e979e"
+dependencies = [
+ "cfg-if",
+ "dashu",
+ "elliptic-curve",
+ "generic-array 1.1.0",
+ "itertools 0.14.0",
+ "k256",
+ "num",
+ "p256",
+ "serde",
+ "slop-algebra",
+ "snowbridge-amcl",
+ "sp1-primitives",
+ "typenum",
+]
+
+[[package]]
+name = "sp1-derive"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09bb8b5d4eade7611018a28063f32a73f5c59bc1b29a8e517413dca66084ca0f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "sp1-hypercube"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03ac19804d8b1bf955fb2fd7722c9b12bcbe9343a0e4b88ecf607a3e85ae63cd"
+dependencies = [
+ "arrayref",
+ "deepsize2",
+ "derive-where",
+ "futures",
+ "hashbrown 0.14.5",
+ "itertools 0.14.0",
+ "num-bigint 0.4.6",
+ "num-traits",
+ "num_cpus",
+ "rayon",
+ "rayon-scan",
+ "serde",
+ "slop-air",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-basefold",
+ "slop-basefold-prover",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-commit",
+ "slop-futures",
+ "slop-jagged",
+ "slop-koala-bear",
+ "slop-matrix",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-poseidon2",
+ "slop-stacked",
+ "slop-sumcheck",
+ "slop-symmetric",
+ "slop-tensor",
+ "slop-uni-stark",
+ "slop-whir",
+ "sp1-derive",
+ "sp1-primitives",
+ "strum",
+ "thiserror 1.0.69",
+ "thousands",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-jit"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fb1eff715595ef7059f2db3845f941bbaf5c2635e2b6b0fe0b0d982d4422f6a"
+dependencies = [
+ "dynasmrt",
+ "hashbrown 0.14.5",
+ "memfd",
+ "memmap2",
+ "serde",
+ "tracing",
+ "uuid",
+]
+
+[[package]]
+name = "sp1-lib"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27c49bc98323d52ec8bef7ae7db15fa095182edfdc2e7d9123f0c57173014e48"
+dependencies = [
+ "bincode",
+ "serde",
+ "sp1-primitives",
+]
+
+[[package]]
+name = "sp1-primitives"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04953c36911214897091107e2a3443fcf531892b0883ce57d4a2eea65d28c72b"
+dependencies = [
+ "bincode",
+ "blake3",
+ "elf",
+ "hex",
+ "itertools 0.14.0",
+ "lazy_static",
+ "num-bigint 0.4.6",
+ "serde",
+ "sha2",
+ "slop-algebra",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-koala-bear",
+ "slop-poseidon2",
+ "slop-primitives",
+ "slop-symmetric",
+]
+
+[[package]]
+name = "sp1-prover"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a47f86dbe432038fed00fd869b0937d11b87abdb0e34a676aaeb5a723f1e31e3"
+dependencies = [
+ "anyhow",
+ "bincode",
+ "clap",
+ "dirs",
+ "downloader",
+ "either",
+ "enum-map",
+ "eyre",
+ "futures",
+ "hashbrown 0.14.5",
+ "hex",
+ "indicatif",
+ "itertools 0.14.0",
+ "lru",
+ "mti",
+ "num-bigint 0.4.6",
+ "opentelemetry",
+ "pin-project",
+ "rand 0.8.5",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "serial_test",
+ "sha2",
+ "slop-air",
+ "slop-algebra",
+ "slop-basefold",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-futures",
+ "slop-jagged",
+ "slop-multilinear",
+ "slop-stacked",
+ "slop-symmetric",
+ "sp1-core-executor",
+ "sp1-core-machine",
+ "sp1-derive",
+ "sp1-hypercube",
+ "sp1-jit",
+ "sp1-primitives",
+ "sp1-prover-types",
+ "sp1-recursion-circuit",
+ "sp1-recursion-compiler",
+ "sp1-recursion-executor",
+ "sp1-recursion-gnark-ffi",
+ "sp1-recursion-machine",
+ "sp1-verifier",
+ "static_assertions",
+ "sysinfo",
+ "tempfile",
+ "thiserror 1.0.69",
+ "tokio",
+ "tonic",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "sp1-prover-types"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9de603ae06be908cca9c4e4117b5e3aceaf4e1b6b9a98b3892ec15a4a865c80f"
+dependencies = [
+ "anyhow",
+ "async-scoped",
+ "bincode",
+ "chrono",
+ "futures-util",
+ "hashbrown 0.14.5",
+ "mti",
+ "prost",
+ "serde",
+ "tokio",
+ "tonic",
+ "tonic-build",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-recursion-circuit"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebf27cc0c97c8280ac5fd475112cf48cd31503da0d56dc902ed46a7d95232b28"
+dependencies = [
+ "bincode",
+ "itertools 0.14.0",
+ "rand 0.8.5",
+ "rayon",
+ "serde",
+ "slop-air",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-basefold",
+ "slop-basefold-prover",
+ "slop-bn254",
+ "slop-challenger",
+ "slop-commit",
+ "slop-jagged",
+ "slop-koala-bear",
+ "slop-matrix",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-stacked",
+ "slop-sumcheck",
+ "slop-symmetric",
+ "slop-tensor",
+ "slop-whir",
+ "sp1-core-executor",
+ "sp1-core-machine",
+ "sp1-derive",
+ "sp1-hypercube",
+ "sp1-primitives",
+ "sp1-recursion-compiler",
+ "sp1-recursion-executor",
+ "sp1-recursion-machine",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-recursion-compiler"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80c70432e7cc894a893a07d49d65149d99ad7deacb89502ec20f135c2f36ab7a"
+dependencies = [
+ "backtrace",
+ "cfg-if",
+ "itertools 0.14.0",
+ "serde",
+ "slop-algebra",
+ "slop-bn254",
+ "slop-symmetric",
+ "sp1-core-machine",
+ "sp1-hypercube",
+ "sp1-primitives",
+ "sp1-recursion-executor",
+ "tracing",
+ "vec_map",
+]
+
+[[package]]
+name = "sp1-recursion-executor"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07d09ed74240eddaaad86945602eda7c35ea90f969de9d7fd4faa9442ab7878c"
+dependencies = [
+ "backtrace",
+ "cfg-if",
+ "hashbrown 0.14.5",
+ "itertools 0.14.0",
+ "range-set-blaze",
+ "serde",
+ "slop-algebra",
+ "slop-maybe-rayon",
+ "slop-poseidon2",
+ "slop-symmetric",
+ "smallvec",
+ "sp1-derive",
+ "sp1-hypercube",
+ "static_assertions",
+ "thiserror 1.0.69",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-recursion-gnark-ffi"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7b67bd9a9dcd038e68fa4a3272a78e2d3098df05250bb78857aa17fef411563"
+dependencies = [
+ "anyhow",
+ "bincode",
+ "bindgen",
+ "cfg-if",
+ "hex",
+ "num-bigint 0.4.6",
+ "serde",
+ "serde_json",
+ "sha2",
+ "slop-algebra",
+ "slop-symmetric",
+ "sp1-hypercube",
+ "sp1-primitives",
+ "sp1-recursion-compiler",
+ "sp1-verifier",
+ "tempfile",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-recursion-machine"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db4903ee3895f7cbffe8f5624e516debd2ef1a4db5b01f75ca1fe3e84b12f5a6"
+dependencies = [
+ "itertools 0.14.0",
+ "rand 0.8.5",
+ "slop-air",
+ "slop-algebra",
+ "slop-basefold",
+ "slop-matrix",
+ "slop-maybe-rayon",
+ "slop-symmetric",
+ "sp1-derive",
+ "sp1-hypercube",
+ "sp1-primitives",
+ "sp1-recursion-executor",
+ "strum",
+ "tracing",
+ "zkhash",
+]
+
+[[package]]
+name = "sp1-sdk"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "378f702c65ac9bea522fdde527f0260a95af155aa10d089e1e9e6ba660b60f50"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bincode",
+ "cfg-if",
+ "dirs",
+ "eventsource-stream",
+ "futures",
+ "hex",
+ "indicatif",
+ "itertools 0.14.0",
+ "k256",
+ "num-bigint 0.4.6",
+ "serde",
+ "sha2",
+ "slop-algebra",
+ "slop-alloc",
+ "slop-basefold",
+ "slop-commit",
+ "slop-jagged",
+ "slop-merkle-tree",
+ "slop-multilinear",
+ "slop-stacked",
+ "slop-sumcheck",
+ "slop-tensor",
+ "sp1-build",
+ "sp1-core-executor",
+ "sp1-core-machine",
+ "sp1-cuda",
+ "sp1-hypercube",
+ "sp1-primitives",
+ "sp1-prover",
+ "sp1-prover-types",
+ "sp1-recursion-executor",
+ "sp1-recursion-gnark-ffi",
+ "sp1-verifier",
+ "strum",
+ "tempfile",
+ "thiserror 1.0.69",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "sp1-verifier"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1942e85d450056725480ac900711869fe1ae453a4e069bcabff3ee7791773e62"
+dependencies = [
+ "bincode",
+ "blake3",
+ "cfg-if",
+ "dirs",
+ "hex",
+ "lazy_static",
+ "serde",
+ "sha2",
+ "slop-algebra",
+ "slop-challenger",
+ "slop-primitives",
+ "slop-symmetric",
+ "sp1-hypercube",
+ "sp1-primitives",
+ "sp1-recursion-executor",
+ "sp1-recursion-machine",
+ "strum",
+ "substrate-bn-succinct-rs",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "sp1-zkvm"
+version = "6.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdf86a2a275e6788a1b34d71bc607fa5d5452d0149a15d34f7945f005ad6e37"
+dependencies = [
+ "cfg-if",
+ "critical-section",
+ "embedded-alloc",
+ "getrandom 0.2.17",
+ "getrandom 0.3.4",
+ "lazy_static",
+ "libm",
+ "rand 0.8.5",
+ "sha2",
+ "sp1-lib",
+ "sp1-primitives",
+]
+
+[[package]]
+name = "spin"
+version = "0.9.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+
+[[package]]
+name = "spki"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
+dependencies = [
+ "base64ct",
+ "der",
+]
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "static_assertions"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
+
+[[package]]
+name = "strength_reduce"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "strum"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "subenum"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec3d08fe7078c57309d5c3d938e50eba95ba1d33b9c3a101a8465fc6861a5416"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "substrate-bn-succinct-rs"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a241fd7c1016fb8ad30fcf5a20986c0c4538e8f15a1b41a1761516299e377ec1"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "cfg-if",
+ "crunchy",
+ "lazy_static",
+ "num-bigint 0.4.6",
+ "rand 0.8.5",
+ "rustc-hex",
+]
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "svgbobdoc"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2c04b93fc15d79b39c63218f15e3fdffaa4c227830686e3b7c5f41244eb3e50"
+dependencies = [
+ "base64 0.13.1",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "unicode-width 0.1.14",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "sysinfo"
+version = "0.30.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
+dependencies = [
+ "cfg-if",
+ "core-foundation-sys",
+ "libc",
+ "ntapi",
+ "once_cell",
+ "rayon",
+ "windows",
+]
+
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
+[[package]]
+name = "tempfile"
+version = "3.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0"
+dependencies = [
+ "fastrand",
+ "getrandom 0.4.1",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl 2.0.18",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "thousands"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820"
+
+[[package]]
+name = "thread_local"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "time"
+version = "0.3.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde_core",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
+
+[[package]]
+name = "time-macros"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "tiny-keccak"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
+dependencies = [
+ "crunchy",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
+[[package]]
+name = "tinyvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.49.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2 0.6.2",
+ "tokio-macros",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+
+[[package]]
+name = "toml_datetime"
+version = "0.7.5+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.19.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
+dependencies = [
+ "indexmap 2.13.0",
+ "toml_datetime 0.6.11",
+ "winnow 0.5.40",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.23.10+spec-1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269"
+dependencies = [
+ "indexmap 2.13.0",
+ "toml_datetime 0.7.5+spec-1.1.0",
+ "toml_parser",
+ "winnow 0.7.14",
+]
+
+[[package]]
+name = "toml_parser"
+version = "1.0.9+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
+dependencies = [
+ "winnow 0.7.14",
+]
+
+[[package]]
+name = "tonic"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
+dependencies = [
+ "async-stream",
+ "async-trait",
+ "axum",
+ "base64 0.22.1",
+ "bytes",
+ "h2",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-timeout",
+ "hyper-util",
+ "percent-encoding",
+ "pin-project",
+ "prost",
+ "rustls-pemfile",
+ "socket2 0.5.10",
+ "tokio",
+ "tokio-rustls",
+ "tokio-stream",
+ "tower 0.4.13",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic-build"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
+dependencies = [
+ "prettyplease",
+ "proc-macro2",
+ "prost-build",
+ "prost-types",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tower"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "indexmap 1.9.3",
+ "pin-project",
+ "pin-project-lite",
+ "rand 0.8.5",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
+dependencies = [
+ "bitflags",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "iri-string",
+ "pin-project-lite",
+ "tower 0.5.3",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-appender"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "786d480bce6247ab75f005b14ae1624ad978d3029d9113f0a22fa1ac773faeaf"
+dependencies = [
+ "crossbeam-channel",
+ "thiserror 2.0.18",
+ "time",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
+dependencies = [
+ "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-forest"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee40835db14ddd1e3ba414292272eddde9dad04d3d4b65509656414d1c42592f"
+dependencies = [
+ "ansi_term",
+ "smallvec",
+ "thiserror 1.0.69",
+ "tracing",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex-automata",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+]
+
+[[package]]
+name = "transpose"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
+dependencies = [
+ "num-integer",
+ "strength_reduce",
+]
+
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
+[[package]]
+name = "typeid_prefix"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9da1387307fdee46aa441e4f08a1b491e659fcac1aca9cd71f2c624a0de5d1b"
+
+[[package]]
+name = "typeid_suffix"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77b55e96f110c6db5d1a2f24072552537f0091dc90cebeaa679540bac93e7405"
+dependencies = [
+ "uuid",
+]
+
+[[package]]
+name = "typenum"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
+
+[[package]]
+name = "unicode-width"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "url"
+version = "2.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "uuid"
+version = "1.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb"
+dependencies = [
+ "atomic",
+ "getrandom 0.4.1",
+ "js-sys",
+ "md-5",
+ "sha1_smol",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
+[[package]]
+name = "vec_map"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.2+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.113"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "js-sys",
+ "once_cell",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.113"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.113"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.113"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.13.0",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-streams"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65"
+dependencies = [
+ "futures-util",
+ "js-sys",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags",
+ "hashbrown 0.15.5",
+ "indexmap 2.13.0",
+ "semver",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.90"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
+dependencies = [
+ "windows-core 0.52.0",
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.62.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
+dependencies = [
+ "windows-implement",
+ "windows-interface",
+ "windows-link",
+ "windows-result",
+ "windows-strings",
+]
+
+[[package]]
+name = "windows-implement"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "windows-interface"
+version = "0.59.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-result"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-strings"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
+dependencies = [
+ "windows-targets 0.53.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm 0.52.6",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.53.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
+dependencies = [
+ "windows-link",
+ "windows_aarch64_gnullvm 0.53.1",
+ "windows_aarch64_msvc 0.53.1",
+ "windows_i686_gnu 0.53.1",
+ "windows_i686_gnullvm 0.53.1",
+ "windows_i686_msvc 0.53.1",
+ "windows_x86_64_gnu 0.53.1",
+ "windows_x86_64_gnullvm 0.53.1",
+ "windows_x86_64_msvc 0.53.1",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
+
+[[package]]
+name = "winnow"
+version = "0.5.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "winnow"
+version = "0.7.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck",
+ "indexmap 2.13.0",
+ "prettyplease",
+ "syn 2.0.117",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "indexmap 2.13.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap 2.13.0",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
+[[package]]
+name = "writeable"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
+
+[[package]]
+name = "wyz"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
+dependencies = [
+ "tap",
+]
+
+[[package]]
+name = "yoke"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zerofrom"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+dependencies = [
+ "zeroize_derive",
+]
+
+[[package]]
+name = "zeroize_derive"
+version = "1.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zerotrie"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zkhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4352d1081da6922701401cdd4cbf29a2723feb4cfabb5771f6fee8e9276da1c7"
+dependencies = [
+ "ark-ff",
+ "ark-std",
+ "bitvec",
+ "blake2",
+ "bls12_381",
+ "byteorder",
+ "cfg-if",
+ "group 0.12.1",
+ "group 0.13.0",
+ "halo2",
+ "hex",
+ "jubjub",
+ "lazy_static",
+ "pasta_curves 0.5.1",
+ "rand 0.8.5",
+ "serde",
+ "sha2",
+ "sha3",
+ "subtle",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
diff --git a/bench_vs/sp1/fibonacci/Cargo.toml b/bench_vs/sp1/fibonacci/Cargo.toml
new file mode 100644
index 000000000..fc24039c2
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/Cargo.toml
@@ -0,0 +1,3 @@
+[workspace]
+members = ["program", "script"]
+resolver = "2"
diff --git a/bench_vs/sp1/fibonacci/program/Cargo.toml b/bench_vs/sp1/fibonacci/program/Cargo.toml
new file mode 100644
index 000000000..551be48b5
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/program/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "fibonacci-program"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+sp1-zkvm = "6.0.1"
diff --git a/bench_vs/sp1/fibonacci/program/src/main.rs b/bench_vs/sp1/fibonacci/program/src/main.rs
new file mode 100644
index 000000000..571e157b3
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/program/src/main.rs
@@ -0,0 +1,14 @@
+#![no_main]
+sp1_zkvm::entrypoint!(main);
+
+pub fn main() {
+    let n: u64 = sp1_zkvm::io::read::<u64>();
+    let mut a: u64 = 0;
+    let mut b: u64 = 1;
+    for _ in 0..n {
+        let c = a.wrapping_add(b);
+        a = b;
+        b = c;
+    }
+    sp1_zkvm::io::commit(&b);
+}
diff --git a/bench_vs/sp1/fibonacci/rust-toolchain b/bench_vs/sp1/fibonacci/rust-toolchain
new file mode 100644
index 000000000..9397b9526
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/rust-toolchain
@@ -0,0 +1,3 @@
+[toolchain]
+channel = "stable"
+components = ["llvm-tools", "rustc-dev"]
diff --git a/bench_vs/sp1/fibonacci/script/Cargo.toml b/bench_vs/sp1/fibonacci/script/Cargo.toml
new file mode 100644
index 000000000..b72b33517
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/script/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "fibonacci-script"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+sp1-sdk = { version = "6.0.1", features = ["blocking"] }
+
+[build-dependencies]
+sp1-build = "6.0.1"
diff --git a/bench_vs/sp1/fibonacci/script/build.rs b/bench_vs/sp1/fibonacci/script/build.rs
new file mode 100644
index 000000000..d6cf925d6
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/script/build.rs
@@ -0,0 +1,5 @@
+use sp1_build::build_program_with_args;
+
+fn main() {
+    build_program_with_args("../program", Default::default());
+}
diff --git a/bench_vs/sp1/fibonacci/script/src/main.rs b/bench_vs/sp1/fibonacci/script/src/main.rs
new file mode 100644
index 000000000..761d0c911
--- /dev/null
+++ b/bench_vs/sp1/fibonacci/script/src/main.rs
@@ -0,0 +1,47 @@
+use sp1_sdk::blocking::{ProveRequest, Prover, ProverClient};
+use sp1_sdk::{include_elf, ProvingKey, SP1Stdin};
+use std::time::Instant;
+
+const FIB_ELF: sp1_sdk::Elf = include_elf!("fibonacci-program");
+
+/// Runs the Fibonacci guest for `n` (argv[1]): cycle count, timed core proof, verification.
+fn main() {
+    sp1_sdk::utils::setup_logger();
+
+    let n: u64 = std::env::args()
+        .nth(1)
+        .expect("Usage: fibonacci-script <n>")
+        .parse()
+        .expect("n must be a u64");
+
+    let client = ProverClient::from_env();
+    let mut stdin = SP1Stdin::new();
+    stdin.write(&n);
+
+    // Setup happens before the timer starts, so it is not part of the reported time.
+    let pk = client.setup(FIB_ELF.clone()).expect("setup failed");
+
+    // Execute the guest to report its cycle count.
+    let (_, report) = client
+        .execute(FIB_ELF.clone(), stdin.clone())
+        .run()
+        .expect("execute failed");
+    println!("Cycles: {}", report.total_instruction_count());
+
+    // Core proof (no recursion); only this call is inside the timed region.
+    let start = Instant::now();
+    let proof = client
+        .prove(&pk, stdin)
+        .core()
+        .run()
+        .expect("prove failed");
+    let elapsed = start.elapsed();
+    println!("Proving time: {:.3}s", elapsed.as_secs_f64());
+
+    // Verify the proof with the verifying key obtained from `pk`.
+    client
+        .verify(&proof, pk.verifying_key(), None)
+        .expect("verify failed");
+
+    println!("Proof verified successfully");
+}

From 472ddc394a5a9cd5346fd435bdb69add8574f4c6 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 25 Feb 2026 19:22:16 -0300
Subject: [PATCH 02/34] Standardize guest

---
 bench_vs/README.md |  4 ++--
 bench_vs/run.sh    | 36 ++++++++++--------------------------
 2 files changed, 12 insertions(+), 28 deletions(-)

diff --git a/bench_vs/README.md b/bench_vs/README.md
index 0a20304c3..1be30c5d2 100644
--- a/bench_vs/README.md
+++ b/bench_vs/README.md
@@ -15,9 +15,9 @@ Compares proving time for an identical u64 wrapping Fibonacci computation.
    sp1up
    ```
 
-3. **RISC-V assembler** — Homebrew clang + ld.lld (macOS):
+3. **Rust nightly** (for cross-compiling Lambda VM guest):
    ```bash
-   brew install llvm
+   rustup toolchain install nightly
    ```
 
 ## Usage
diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 1575e62a3..95b84d4d6 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -9,8 +9,7 @@
 # Prerequisites:
 #   - Lambda VM CLI built: cargo build --release -p cli
 #   - SP1 toolchain installed: curl -L https://sp1up.succinct.xyz | bash && sp1up
-#   - clang with RISC-V target support (macOS Homebrew clang works)
-#   - ld.lld linker
+#   - Rust nightly toolchain: rustup toolchain install nightly
 
 set -euo pipefail
 
@@ -65,6 +64,9 @@ rm -rf "$TMP_DIR" && mkdir -p "$TMP_DIR"
 # --- Pre-build ---------------------------------------------------------------
 
 CLI="$ROOT_DIR/target/release/cli"
+LAMBDA_DIR="$SCRIPT_DIR/lambda/fibonacci"
+TARGET_SPEC="$ROOT_DIR/executor/programs/riscv64im-lambda-vm-elf.json"
+
 if $RUN_LAMBDA && [ ! -f "$CLI" ]; then
     echo -e "${YELLOW}[Lambda VM] CLI not found, building...${NC}"
     cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1
@@ -97,32 +99,14 @@ run_one() {
     local sp1_cycles=""
 
     if $RUN_LAMBDA; then
-        # Generate assembly
-        cat > "$TMP_DIR/fib.s" <<ASM
-	.attribute	5, "rv64i2p1_m2p0"
-	.globl	main
-main:
-	li	t0, 0
-	li	t1, 1
-	li	a0, ${N}
-
-.loop:
-	add	t2, t0, t1
-	mv	t0, t1
-	mv	t1, t2
-	addi	a0, a0, -1
-	bnez	a0, .loop
-
-	mv	a0, t1
-	li	a7, 5
-	ecall
-ASM
-        clang --target=riscv64 -march=rv64im -mabi=lp64 -nostdlib \
-            -c "$TMP_DIR/fib.s" -o "$TMP_DIR/fib.o"
-        ld.lld -o "$TMP_DIR/fib.elf" "$TMP_DIR/fib.o" --entry=main
+        echo -e "  ${GREEN}[Lambda VM] Building (n=${N})...${NC}"
+        (cd "$LAMBDA_DIR" && BENCH_N="$N" cargo +nightly build --release \
+            --target "$TARGET_SPEC" \
+            -Z build-std=core -Z build-std-features=compiler-builtins-mem 2>&1 | tail -1)
+        LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench"
 
         echo -e "  ${GREEN}[Lambda VM] Proving...${NC}"
-        LAMBDA_OUTPUT=$("$CLI" prove "$TMP_DIR/fib.elf" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null)
+        LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null)
         lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*')
         echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
     fi

From 88171d01081e8d9ebde34497456d9ddcb4bc5b40 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 25 Feb 2026 19:35:36 -0300
Subject: [PATCH 03/34] Fix decimals

---
 bench_vs/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 95b84d4d6..ed0b519df 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -156,7 +156,7 @@ for i in "${!RESULT_N[@]}"; do
 
     if $RUN_LAMBDA && $RUN_SP1; then
         if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then
-            RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $st / $lt}")
+            RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.2fx\", $st / $lt}")
             if (( $(LC_NUMERIC=C awk "BEGIN {print ($st > $lt)}") )); then
                 RATIO="${GREEN}${RATIO}${NC}"
             else

From ef7d5bb6a62990733dd2959689803f64efcd1762 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 25 Feb 2026 19:36:24 -0300
Subject: [PATCH 04/34] Benchmark vs sp1

---
 bench_vs/run.sh                         |  8 +--
 bench_vs/sp1/fibonacci/.tldr/daemon.pid |  1 -
 bench_vs/sp1/fibonacci/.tldr/status     |  1 -
 bench_vs/sp1/fibonacci/.tldrignore      | 84 -------------------------
 4 files changed, 4 insertions(+), 90 deletions(-)
 delete mode 100644 bench_vs/sp1/fibonacci/.tldr/daemon.pid
 delete mode 100644 bench_vs/sp1/fibonacci/.tldr/status
 delete mode 100644 bench_vs/sp1/fibonacci/.tldrignore

diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index ed0b519df..113666124 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -156,11 +156,11 @@ for i in "${!RESULT_N[@]}"; do
 
     if $RUN_LAMBDA && $RUN_SP1; then
         if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then
-            RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.2fx\", $st / $lt}")
-            if (( $(LC_NUMERIC=C awk "BEGIN {print ($st > $lt)}") )); then
-                RATIO="${GREEN}${RATIO}${NC}"
-            else
+            RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $lt / $st}")
+            if (( $(LC_NUMERIC=C awk "BEGIN {print ($lt > $st)}") )); then
                 RATIO="${RED}${RATIO}${NC}"
+            else
+                RATIO="${GREEN}${RATIO}${NC}"
             fi
             printf "  %-10s  %11ss  %11ss  " "$n" "$lt" "$st"
             echo -e "$RATIO"
diff --git a/bench_vs/sp1/fibonacci/.tldr/daemon.pid b/bench_vs/sp1/fibonacci/.tldr/daemon.pid
deleted file mode 100644
index 10eda36c4..000000000
--- a/bench_vs/sp1/fibonacci/.tldr/daemon.pid
+++ /dev/null
@@ -1 +0,0 @@
-39495
\ No newline at end of file
diff --git a/bench_vs/sp1/fibonacci/.tldr/status b/bench_vs/sp1/fibonacci/.tldr/status
deleted file mode 100644
index ad50b5340..000000000
--- a/bench_vs/sp1/fibonacci/.tldr/status
+++ /dev/null
@@ -1 +0,0 @@
-ready
\ No newline at end of file
diff --git a/bench_vs/sp1/fibonacci/.tldrignore b/bench_vs/sp1/fibonacci/.tldrignore
deleted file mode 100644
index e01df83cb..000000000
--- a/bench_vs/sp1/fibonacci/.tldrignore
+++ /dev/null
@@ -1,84 +0,0 @@
-# TLDR ignore patterns (gitignore syntax)
-# Auto-generated - review and customize for your project
-# Docs: https://git-scm.com/docs/gitignore
-
-# ===================
-# Dependencies
-# ===================
-node_modules/
-.venv/
-venv/
-env/
-__pycache__/
-.tox/
-.nox/
-.pytest_cache/
-.mypy_cache/
-.ruff_cache/
-vendor/
-Pods/
-
-# ===================
-# Build outputs
-# ===================
-dist/
-build/
-out/
-target/
-*.egg-info/
-*.whl
-*.pyc
-*.pyo
-
-# ===================
-# Binary/large files
-# ===================
-*.so
-*.dylib
-*.dll
-*.exe
-*.bin
-*.o
-*.a
-*.lib
-
-# ===================
-# IDE/editors
-# ===================
-.idea/
-.vscode/
-*.swp
-*.swo
-*~
-
-# ===================
-# Security (always exclude)
-# ===================
-.env
-.env.*
-*.pem
-*.key
-*.p12
-*.pfx
-credentials.*
-secrets.*
-
-# ===================
-# Version control
-# ===================
-.git/
-.hg/
-.svn/
-
-# ===================
-# OS files
-# ===================
-.DS_Store
-Thumbs.db
-
-# ===================
-# Project-specific
-# Add your custom patterns below
-# ===================
-# large_test_fixtures/
-# data/

From e11346a084bcfe9781a184679420a3161a9ab39e Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 25 Feb 2026 20:19:15 -0300
Subject: [PATCH 05/34] Add lambda program

---
 bench_vs/lambda/fibonacci/.cargo/config.toml |  6 ++++
 bench_vs/lambda/fibonacci/Cargo.lock         |  7 ++++
 bench_vs/lambda/fibonacci/Cargo.toml         |  8 +++++
 bench_vs/lambda/fibonacci/build.rs           | 10 ++++++
 bench_vs/lambda/fibonacci/src/main.rs        | 35 ++++++++++++++++++++
 5 files changed, 66 insertions(+)
 create mode 100644 bench_vs/lambda/fibonacci/.cargo/config.toml
 create mode 100644 bench_vs/lambda/fibonacci/Cargo.lock
 create mode 100644 bench_vs/lambda/fibonacci/Cargo.toml
 create mode 100644 bench_vs/lambda/fibonacci/build.rs
 create mode 100644 bench_vs/lambda/fibonacci/src/main.rs

diff --git a/bench_vs/lambda/fibonacci/.cargo/config.toml b/bench_vs/lambda/fibonacci/.cargo/config.toml
new file mode 100644
index 000000000..be730c3ec
--- /dev/null
+++ b/bench_vs/lambda/fibonacci/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.riscv64im-lambda-vm-elf]
+rustflags = [
+  "-C", "link-arg=-e",
+  "-C", "link-arg=main",
+  "-C", "passes=lower-atomic"
+]
diff --git a/bench_vs/lambda/fibonacci/Cargo.lock b/bench_vs/lambda/fibonacci/Cargo.lock
new file mode 100644
index 000000000..3a4bb7634
--- /dev/null
+++ b/bench_vs/lambda/fibonacci/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "fibonacci-bench"
+version = "0.1.0"
diff --git a/bench_vs/lambda/fibonacci/Cargo.toml b/bench_vs/lambda/fibonacci/Cargo.toml
new file mode 100644
index 000000000..8ce06fec5
--- /dev/null
+++ b/bench_vs/lambda/fibonacci/Cargo.toml
@@ -0,0 +1,8 @@
+[workspace]
+
+[package]
+name = "fibonacci-bench"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
diff --git a/bench_vs/lambda/fibonacci/build.rs b/bench_vs/lambda/fibonacci/build.rs
new file mode 100644
index 000000000..5c189eadb
--- /dev/null
+++ b/bench_vs/lambda/fibonacci/build.rs
@@ -0,0 +1,10 @@
+use std::env;
+use std::fs;
+use std::path::Path;
+
+fn main() {
+    let n = env::var("BENCH_N").unwrap_or_else(|_| "1000".to_string());
+    let out_dir = env::var("OUT_DIR").unwrap();
+    fs::write(Path::new(&out_dir).join("n.txt"), &n).unwrap();
+    println!("cargo:rerun-if-env-changed=BENCH_N");
+}
diff --git a/bench_vs/lambda/fibonacci/src/main.rs b/bench_vs/lambda/fibonacci/src/main.rs
new file mode 100644
index 000000000..8f54cf604
--- /dev/null
+++ b/bench_vs/lambda/fibonacci/src/main.rs
@@ -0,0 +1,35 @@
+#![no_std]
+#![no_main]
+
+use core::panic::PanicInfo;
+
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! {
+    loop {}
+}
+
+const N: u64 = include!(concat!(env!("OUT_DIR"), "/n.txt"));
+
+#[inline(never)]
+fn halt(code: u64) -> ! {
+    unsafe {
+        core::arch::asm!(
+            "ecall",
+            in("a0") code,
+            in("a7") 5u64,
+            options(noreturn),
+        );
+    }
+}
+
+#[unsafe(no_mangle)]
+pub fn main() -> ! {
+    let mut a: u64 = 0;
+    let mut b: u64 = 1;
+    for _ in 0..N {
+        let c = a.wrapping_add(b);
+        a = b;
+        b = c;
+    }
+    halt(b)
+}

From 687d8d87009deac5cb3b5a7823742f916efd108e Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 25 Feb 2026 20:20:41 -0300
Subject: [PATCH 06/34] Remove stray blank line from .gitignore

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 3ef9f8283..9c826f0d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,3 @@ executor/program_artifacts/
 # Shared cargo target directory for ELF builds
 executor/shared_target/
 
-

From 00696591cf985633c3f939e5c06a7df753ad690a Mon Sep 17 00:00:00 2001
From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com>
Date: Mon, 2 Mar 2026 11:12:21 -0300
Subject: [PATCH 07/34] Apply suggestion from @gabrielbosio

Co-authored-by: Gabriel Bosio <38794644+gabrielbosio@users.noreply.github.com>
---
 bench_vs/run.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 113666124..4aa249a5d 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -102,7 +102,8 @@ run_one() {
         echo -e "  ${GREEN}[Lambda VM] Building (n=${N})...${NC}"
         (cd "$LAMBDA_DIR" && BENCH_N="$N" cargo +nightly build --release \
             --target "$TARGET_SPEC" \
-            -Z build-std=core -Z build-std-features=compiler-builtins-mem 2>&1 | tail -1)
+            -Z build-std=core -Z build-std-features=compiler-builtins-mem \
+            -Z json-target-spec 2>&1 | tail -1)
         LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench"
 
         echo -e "  ${GREEN}[Lambda VM] Proving...${NC}"

From dabe54c5faf9ae4ddc58d492281b6709b4136571 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 2 Mar 2026 16:57:38 -0300
Subject: [PATCH 08/34] Add instruments

---
 Cargo.toml                                |   5 +
 bin/cli/Cargo.toml                        |   1 +
 crypto/stark/src/constraints/evaluator.rs |  36 ----
 crypto/stark/src/instruments.rs           | 127 ++++++++++++++
 crypto/stark/src/lib.rs                   |   2 +
 crypto/stark/src/prover.rs                | 181 +++++++++++++++----
 prover/Cargo.toml                         |   5 +
 prover/benches/profile_vm_prover.rs       |  10 +-
 prover/src/instruments.rs                 | 204 ++++++++++++++++++++++
 prover/src/lib.rs                         |  43 +++++
 10 files changed, 538 insertions(+), 76 deletions(-)
 create mode 100644 crypto/stark/src/instruments.rs
 create mode 100644 prover/src/instruments.rs

diff --git a/Cargo.toml b/Cargo.toml
index 577ab04c4..e24fd3bfc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,3 +15,8 @@ resolver = "2"
 [profile.dev]
 opt-level = 3
 debug = true
+
+# debug = 1 ("limited" debug info): gives profilers (samply, perf) function names and
+# line info without slowing compilation or bloating the binary significantly.
+[profile.release]
+debug = 1
diff --git a/bin/cli/Cargo.toml b/bin/cli/Cargo.toml
index 8eb62c86f..602e58f5f 100644
--- a/bin/cli/Cargo.toml
+++ b/bin/cli/Cargo.toml
@@ -15,3 +15,4 @@ tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true }
 
 [features]
 jemalloc-stats = ["dep:tikv-jemalloc-ctl"]
+instruments = ["prover/instruments"]
diff --git a/crypto/stark/src/constraints/evaluator.rs b/crypto/stark/src/constraints/evaluator.rs
index 908d7b950..2c46e334b 100644
--- a/crypto/stark/src/constraints/evaluator.rs
+++ b/crypto/stark/src/constraints/evaluator.rs
@@ -18,8 +18,6 @@ use rayon::{
 };
 
 use std::marker::PhantomData;
-#[cfg(feature = "instruments")]
-use std::time::Instant;
 
 pub struct ConstraintEvaluator<
     Field: IsSubFieldOf<FieldExtension> + IsFFTField + Send + Sync,
@@ -226,9 +224,6 @@ where
         #[cfg(all(debug_assertions, not(feature = "parallel")))]
         let boundary_polys: Vec<Polynomial<FieldElement<Field>>> = Vec::new();
 
-        #[cfg(feature = "instruments")]
-        let timer = Instant::now();
-
         let trace_length = domain.interpolation_domain_size;
         let lde_periodic_columns = air
             .get_periodic_column_polynomials(trace_length)
@@ -244,15 +239,6 @@ where
             .collect::<Result<Vec<Vec<FieldElement<Field>>>, FFTError>>()
             .unwrap();
 
-        #[cfg(feature = "instruments")]
-        println!(
-            "     Evaluating periodic columns on lde: {:#?}",
-            timer.elapsed()
-        );
-
-        #[cfg(feature = "instruments")]
-        let timer = Instant::now();
-
         // Fused boundary evaluation: compute (trace[col] - value) on-the-fly
         // instead of pre-computing all boundary_polys_evaluations.
         // This eliminates N_constraints × LDE_size intermediate allocations.
@@ -282,12 +268,6 @@ where
             })
             .collect();
 
-        #[cfg(feature = "instruments")]
-        println!(
-            "     Evaluated boundary polynomials on LDE: {:#?}",
-            timer.elapsed()
-        );
-
         #[cfg(all(debug_assertions, not(feature = "parallel")))]
         let boundary_zerofiers = Vec::new();
 
@@ -297,22 +277,12 @@ where
         #[cfg(all(debug_assertions, not(feature = "parallel")))]
         let _transition_evaluations: Vec<FieldElement<FieldExtension>> = Vec::new();
 
-        #[cfg(feature = "instruments")]
-        let timer = Instant::now();
         let zerofier_data = air.transition_zerofier_evaluations_grouped(domain);
-        #[cfg(feature = "instruments")]
-        println!(
-            "     Evaluated transition zerofiers: {:#?}",
-            timer.elapsed()
-        );
 
         // Iterate over all LDE domain and compute the part of the composition polynomial
         // related to the transition constraints and add it to the already computed part of the
         // boundary constraints.
 
-        #[cfg(feature = "instruments")]
-        let timer = Instant::now();
-
         let num_transition = air.num_transition_constraints();
         let num_periodic = lde_periodic_columns.len();
         let offsets = &air.context().transition_offsets;
@@ -330,12 +300,6 @@ where
             offsets,
         );
 
-        #[cfg(feature = "instruments")]
-        println!(
-            "     Evaluated transitions and accumulated results: {:#?}",
-            timer.elapsed()
-        );
-
         evaluations_t
     }
 }
diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs
new file mode 100644
index 000000000..11ac350af
--- /dev/null
+++ b/crypto/stark/src/instruments.rs
@@ -0,0 +1,127 @@
+use std::cell::RefCell;
+use std::time::Duration;
+
+/// Sub-operation timing breakdown for a single table in Rounds 2-4.
+#[derive(Clone, Debug, Default)]
+pub struct TableSubOps {
+    /// reconstruct_round1 (expand_pool_to_lde)
+    pub trace_lde: Duration,
+    /// evaluator.evaluate()
+    pub constraints: Duration,
+    /// decompose_and_extend_d2
+    pub comp_decompose: Duration,
+    /// commit_composition_polynomial
+    pub comp_commit: Duration,
+    /// Round 3: barycentric OOD evaluation
+    pub ood: Duration,
+    /// Round 4: compute_deep_composition_poly_evaluations
+    pub deep_comp: Duration,
+    /// Round 4: interpolate_fft + evaluate_fft
+    pub deep_extend: Duration,
+    /// fri::commit_phase_from_evaluations
+    pub fri_commit: Duration,
+    /// Round 4: grinding + FRI query + Merkle openings
+    pub queries: Duration,
+}
+
+/// Sub-operation timing breakdown for Round 1: LDE vs. Merkle commit, for main and aux traces.
+#[derive(Clone, Debug, Default)]
+pub struct Round1SubOps {
+    /// Main trace: expand_pool_to_lde (LDE/FFT)
+    pub main_lde: Duration,
+    /// Main trace: commit_columns_bit_reversed (Merkle)
+    pub main_merkle: Duration,
+    /// Aux trace: expand_pool_to_lde (LDE/FFT)
+    pub aux_lde: Duration,
+    /// Aux trace: commit_columns_bit_reversed (Merkle)
+    pub aux_merkle: Duration,
+}
+
+/// Timing data collected inside `multi_prove`.
+pub struct MultiProveTiming {
+    pub prepass: Duration,
+    pub main_commits: Duration,
+    pub aux_build: Duration,
+    pub aux_commit: Duration,
+    pub rounds_2_4: Duration,
+    /// Sub-op breakdown for Round 1 (main + aux LDE vs Merkle).
+    pub round1_sub: Round1SubOps,
+    /// (name, rows, duration, sub_ops) per table for rounds 2-4.
+    pub table_timings: Vec<(String, usize, Duration, TableSubOps)>,
+}
+
+thread_local! {
+    static TIMING_DATA: RefCell<Option<MultiProveTiming>> = const { RefCell::new(None) };
+    /// Round 1 sub-timings accumulated across the main-commit and aux-commit loops.
+    static R1_SUB: RefCell<Round1SubOps> = const { RefCell::new(Round1SubOps {
+        main_lde: Duration::ZERO, main_merkle: Duration::ZERO,
+        aux_lde: Duration::ZERO, aux_merkle: Duration::ZERO,
+    }) };
+    /// Round 2 sub-timings: (constraints, fft, merkle)
+    static R2_SUB: RefCell<Option<(Duration, Duration, Duration)>> = const { RefCell::new(None) };
+    /// Round 4 sub-timings: (fft, merkle, deep_comp, queries)
+    static R4_SUB: RefCell<Option<(Duration, Duration, Duration, Duration)>> = const { RefCell::new(None) };
+    /// Assembled sub-ops from prove_rounds_2_to_4 (without reconstruct_round1 LDE time).
+    static ROUND_SUB_OPS: RefCell<Option<TableSubOps>> = const { RefCell::new(None) };
+}
+
+pub fn store(data: MultiProveTiming) {
+    TIMING_DATA.with(|cell| {
+        *cell.borrow_mut() = Some(data);
+    });
+}
+
+pub fn take() -> Option<MultiProveTiming> {
+    TIMING_DATA.with(|cell| cell.borrow_mut().take())
+}
+
+pub fn accum_r1_main(lde: Duration, merkle: Duration) {
+    R1_SUB.with(|cell| {
+        let mut s = cell.borrow_mut();
+        s.main_lde += lde;
+        s.main_merkle += merkle;
+    });
+}
+
+pub fn accum_r1_aux(lde: Duration, merkle: Duration) {
+    R1_SUB.with(|cell| {
+        let mut s = cell.borrow_mut();
+        s.aux_lde += lde;
+        s.aux_merkle += merkle;
+    });
+}
+
+/// Returns the Round 1 sub-timings accumulated so far on this thread and resets
+/// the accumulator to `Round1SubOps::default()` (all durations zero).
+pub fn take_r1_sub() -> Round1SubOps {
+    R1_SUB.with(|cell| {
+        // `mem::take` moves the value out and leaves `Default::default()` behind.
+        std::mem::take(&mut *cell.borrow_mut())
+    })
+}
+
+pub fn store_r2_sub(constraints: Duration, fft: Duration, merkle: Duration) {
+    R2_SUB.with(|cell| *cell.borrow_mut() = Some((constraints, fft, merkle)));
+}
+
+pub fn take_r2_sub() -> Option<(Duration, Duration, Duration)> {
+    R2_SUB.with(|cell| cell.borrow_mut().take())
+}
+
+pub fn store_r4_sub(fft: Duration, merkle: Duration, deep_comp: Duration, queries: Duration) {
+    R4_SUB.with(|cell| *cell.borrow_mut() = Some((fft, merkle, deep_comp, queries)));
+}
+
+pub fn take_r4_sub() -> Option<(Duration, Duration, Duration, Duration)> {
+    R4_SUB.with(|cell| cell.borrow_mut().take())
+}
+
+pub fn store_round_sub_ops(data: TableSubOps) {
+    ROUND_SUB_OPS.with(|cell| {
+        *cell.borrow_mut() = Some(data);
+    });
+}
+
+pub fn take_round_sub_ops() -> Option<TableSubOps> {
+    ROUND_SUB_OPS.with(|cell| cell.borrow_mut().take())
+}
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index d8f293589..0415572af 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -1,6 +1,8 @@
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug;
 pub mod constraints;
+#[cfg(feature = "instruments")]
+pub mod instruments;
 pub mod context;
 pub mod debug;
 pub mod domain;
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 085bcb03d..c0223d67a 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -445,10 +445,18 @@ pub trait IsStarkProver<
     {
         let num_cols = trace.num_main_columns;
         trace.extract_columns_main_into(main_pool);
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         Self::expand_pool_to_lde::<Field>(main_pool, num_cols, domain, twiddles);
+        #[cfg(feature = "instruments")]
+        let main_lde_dur = t_sub.elapsed();
 
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let (tree, root) = Self::commit_columns_bit_reversed(&main_pool[..num_cols])
             .ok_or(ProvingError::EmptyCommitment)?;
+        #[cfg(feature = "instruments")]
+        crate::instruments::accum_r1_main(main_lde_dur, t_sub.elapsed());
 
         transcript.append_bytes(&root);
 
@@ -483,8 +491,14 @@ pub trait IsStarkProver<
     {
         let num_cols = trace.num_main_columns;
         trace.extract_columns_main_into(main_pool);
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         Self::expand_pool_to_lde::<Field>(main_pool, num_cols, domain, twiddles);
+        #[cfg(feature = "instruments")]
+        let main_lde_dur = t_sub.elapsed();
 
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let (precomputed_tree, precomputed_root) =
             Self::commit_columns_bit_reversed(&main_pool[..num_precomputed_cols])
                 .ok_or(ProvingError::EmptyCommitment)?;
@@ -492,6 +506,8 @@ pub trait IsStarkProver<
         let (mult_tree, mult_root) =
             Self::commit_columns_bit_reversed(&main_pool[num_precomputed_cols..num_cols])
                 .ok_or(ProvingError::EmptyCommitment)?;
+        #[cfg(feature = "instruments")]
+        crate::instruments::accum_r1_main(main_lde_dur, t_sub.elapsed());
 
         debug_assert_eq!(
             precomputed_root, precomputed_commitment,
@@ -806,6 +822,8 @@ pub trait IsStarkProver<
             round_1_result.bus_public_inputs.as_ref(),
             trace_length,
         );
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let constraint_evaluations = evaluator.evaluate(
             air,
             &round_1_result.lde_trace,
@@ -814,9 +832,13 @@ pub trait IsStarkProver<
             boundary_coefficients,
             &round_1_result.rap_challenges,
         );
+        #[cfg(feature = "instruments")]
+        let constraints_dur = t_sub.elapsed();
 
         let number_of_parts = air.composition_poly_degree_bound(trace_length) / trace_length;
 
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let lde_composition_poly_parts_evaluations = if number_of_parts == 2 {
             // Direct quotient decomposition: avoid full-size iFFT by algebraically
             // splitting H(x) = H₀(x²) + x·H₁(x²) using:
@@ -846,12 +868,21 @@ pub trait IsStarkProver<
                 })
                 .collect()
         };
+        #[cfg(feature = "instruments")]
+        let fft_dur = t_sub.elapsed();
 
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let Some((composition_poly_merkle_tree, composition_poly_root)) =
             Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
         else {
             return Err(ProvingError::EmptyCommitment);
         };
+        #[cfg(feature = "instruments")]
+        let merkle_dur = t_sub.elapsed();
+
+        #[cfg(feature = "instruments")]
+        crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur);
 
         Ok(Round2 {
             lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations,
@@ -974,6 +1005,8 @@ pub trait IsStarkProver<
         let gammas = deep_composition_coefficients;
 
         // Compute p₀ (deep composition polynomial) as N evaluations on trace-size coset
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let deep_evals = Self::compute_deep_composition_poly_evaluations(
             &round_1_result.lde_trace,
             round_2_result,
@@ -984,18 +1017,26 @@ pub trait IsStarkProver<
             &gammas,
             &trace_term_coeffs,
         );
+        #[cfg(feature = "instruments")]
+        let other_dur_1 = t_sub.elapsed();
 
         // Extend N trace-coset evaluations to 2N LDE-coset evaluations via standard LDE.
         // deep_evals[i] = h(offset·ω_N^i) = f(ω_N^i) where f(x) = h(offset·x).
         // Standard iFFT+FFT recovers f and evaluates on the 2N-th roots: f(Ω^j) = h(offset·Ω^j).
         let domain_size = domain.lde_roots_of_unity_coset.len();
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let deep_poly =
             Polynomial::interpolate_fft::<Field>(&deep_evals).expect("iFFT should succeed");
         let mut lde_evals = Polynomial::evaluate_fft::<Field>(&deep_poly, 1, Some(domain_size))
             .expect("FFT should succeed");
         in_place_bit_reverse_permute(&mut lde_evals);
+        #[cfg(feature = "instruments")]
+        let r4_fft_dur = t_sub.elapsed();
 
         // FRI commit phase from pre-computed evaluations (no initial FFT)
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let (fri_last_value, fri_layers) =
             fri::commit_phase_from_evaluations::<Field, FieldExtension>(
                 domain.root_order as usize,
@@ -1004,8 +1045,12 @@ pub trait IsStarkProver<
                 &coset_offset,
                 domain_size,
             );
+        #[cfg(feature = "instruments")]
+        let r4_merkle_dur = t_sub.elapsed();
 
         // grinding: generate nonce and append it to the transcript
+        #[cfg(feature = "instruments")]
+        let t_sub = Instant::now();
         let security_bits = air.context().proof_options.grinding_factor;
         let mut nonce = None;
         if security_bits > 0 {
@@ -1028,6 +1073,12 @@ pub trait IsStarkProver<
         let deep_poly_openings =
             Self::open_deep_composition_poly(domain, round_1_result, round_2_result, &iotas);
 
+        #[cfg(feature = "instruments")]
+        {
+            let queries_dur = t_sub.elapsed();
+            crate::instruments::store_r4_sub(r4_fft_dur, r4_merkle_dur, other_dur_1, queries_dur);
+        }
+
         Round4 {
             fri_last_value,
             fri_layers_merkle_roots,
@@ -1408,6 +1459,9 @@ pub trait IsStarkProver<
         // Pre-pass: compute domains, twiddles, and max dimensions for pool allocation
         // =====================================================================
 
+        #[cfg(feature = "instruments")]
+        let phase_start = Instant::now();
+
         let mut domains = Vec::with_capacity(num_airs);
         let mut twiddle_caches: Vec<LdeTwiddles<Field>> = Vec::with_capacity(num_airs);
         let mut max_main_cols = 0usize;
@@ -1437,12 +1491,18 @@ pub trait IsStarkProver<
             .map(|_| Vec::with_capacity(max_lde_size))
             .collect();
 
+        #[cfg(feature = "instruments")]
+        let prepass_elapsed = phase_start.elapsed();
+
         // =====================================================================
         // Round 1, Phase A: Commit all main traces (lightweight)
         // =====================================================================
         // All main trace commitments must be in the transcript before sampling
         // LogUp challenges. Pool buffers are reused across tables.
 
+        #[cfg(feature = "instruments")]
+        let phase_start = Instant::now();
+
         let mut main_commits: Vec<MainCommitData<Field>> = Vec::with_capacity(num_airs);
 
         for ((air, trace, _pub_inputs), twiddles) in
@@ -1473,6 +1533,9 @@ pub trait IsStarkProver<
             });
         }
 
+        #[cfg(feature = "instruments")]
+        let main_commits_elapsed = phase_start.elapsed();
+
         // =====================================================================
         // Round 1, Phase B: Sample shared LogUp challenges
         // =====================================================================
@@ -1499,6 +1562,9 @@ pub trait IsStarkProver<
         // Pass 1: Build aux traces in parallel.
         // Each build_auxiliary_trace has internal parallelism (batch_inverse, par_chunks),
         // but outer parallelism over 12 tables also helps on high-core-count machines.
+        #[cfg(feature = "instruments")]
+        let phase_start = Instant::now();
+
         #[cfg(feature = "parallel")]
         let aux_iter = air_trace_pairs.par_iter_mut();
         #[cfg(not(feature = "parallel"))]
@@ -1513,8 +1579,14 @@ pub trait IsStarkProver<
             })
             .collect();
 
+        #[cfg(feature = "instruments")]
+        let aux_build_elapsed = phase_start.elapsed();
+
         // Pass 2: Sequential fork transcript → extract → LDE → commit.
         // Uses shared aux_pool. Each table gets its own transcript fork.
+        #[cfg(feature = "instruments")]
+        let phase_start = Instant::now();
+
         let mut metadatas: Vec<Round1Metadata<Field, FieldExtension>> =
             Vec::with_capacity(num_airs);
         let mut table_transcripts = Vec::with_capacity(num_airs);
@@ -1537,15 +1609,23 @@ pub trait IsStarkProver<
             let (aux_tree, aux_root) = if air.has_aux_trace() {
                 let num_aux_cols = trace.num_aux_columns;
                 trace.extract_columns_aux_into(&mut aux_pool);
+                #[cfg(feature = "instruments")]
+                let t_sub = Instant::now();
                 Self::expand_pool_to_lde::<FieldExtension>(
                     &mut aux_pool,
                     num_aux_cols,
                     domain,
                     twiddles,
                 );
+                #[cfg(feature = "instruments")]
+                let aux_lde_dur = t_sub.elapsed();
 
+                #[cfg(feature = "instruments")]
+                let t_sub = Instant::now();
                 let (tree, root) = Self::commit_columns_bit_reversed(&aux_pool[..num_aux_cols])
                     .ok_or(ProvingError::EmptyCommitment)?;
+                #[cfg(feature = "instruments")]
+                crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed());
 
                 table_transcript.append_bytes(&root);
                 (Some(Rc::new(tree)), Some(root))
@@ -1567,6 +1647,9 @@ pub trait IsStarkProver<
             table_transcripts.push(table_transcript);
         }
 
+        #[cfg(feature = "instruments")]
+        let aux_commit_elapsed = phase_start.elapsed();
+
         #[cfg(feature = "debug-checks")]
         Self::run_debug_checks(
             &air_trace_pairs,
@@ -1583,6 +1666,12 @@ pub trait IsStarkProver<
         // For each table, recompute LDE into pool buffers, reuse stored Merkle trees,
         // run rounds 2-4 with the table's forked transcript, then drop table data.
 
+        #[cfg(feature = "instruments")]
+        let phase_start = Instant::now();
+        #[cfg(feature = "instruments")]
+        let mut table_timings: Vec<(String, usize, std::time::Duration, crate::instruments::TableSubOps)> =
+            Vec::with_capacity(num_airs);
+
         let mut proofs = Vec::with_capacity(num_airs);
         for (((((air, trace, pub_inputs), metadata), domain), twiddles), table_transcript) in
             air_trace_pairs
@@ -1592,7 +1681,12 @@ pub trait IsStarkProver<
                 .zip(twiddle_caches.iter())
                 .zip(table_transcripts.iter_mut())
         {
+            #[cfg(feature = "instruments")]
+            let table_start = Instant::now();
+
             // Recompute LDE evaluations into pool, reuse stored Merkle trees
+            #[cfg(feature = "instruments")]
+            let lde_start = Instant::now();
             let round_1_result = Self::reconstruct_round1(
                 *air,
                 *trace,
@@ -1602,6 +1696,8 @@ pub trait IsStarkProver<
                 &mut main_pool,
                 &mut aux_pool,
             )?;
+            #[cfg(feature = "instruments")]
+            let lde_dur = lde_start.elapsed();
 
             let proof = Self::prove_rounds_2_to_4(
                 *air,
@@ -1612,6 +1708,19 @@ pub trait IsStarkProver<
             )?;
             proofs.push(proof);
 
+            #[cfg(feature = "instruments")]
+            {
+                let mut sub_ops = crate::instruments::take_round_sub_ops()
+                    .unwrap_or_default();
+                sub_ops.trace_lde += lde_dur;
+                table_timings.push((
+                    air.name().to_string(),
+                    trace.num_rows(),
+                    table_start.elapsed(),
+                    sub_ops,
+                ));
+            }
+
             // Return column Vecs to pool (zero-copy move back). Pool slots that were
             // `take`n in reconstruct_round1 get their buffers back with capacity intact.
             let (main_cols, aux_cols) = round_1_result.lde_trace.into_columns();
@@ -1623,6 +1732,21 @@ pub trait IsStarkProver<
             }
         }
 
+        #[cfg(feature = "instruments")]
+        {
+            // Store timing data for the top-level report in prove_with_options.
+            // Uses a thread-local to avoid changing multi_prove's return type.
+            crate::instruments::store(crate::instruments::MultiProveTiming {
+                prepass: prepass_elapsed,
+                main_commits: main_commits_elapsed,
+                aux_build: aux_build_elapsed,
+                aux_commit: aux_commit_elapsed,
+                rounds_2_4: phase_start.elapsed(),
+                round1_sub: crate::instruments::take_r1_sub(),
+                table_timings,
+            });
+        }
+
         Ok(MultiProof::new(proofs))
     }
 
@@ -1665,11 +1789,6 @@ pub trait IsStarkProver<
         // ==========|   Round 2   |==========
         // ===================================
 
-        #[cfg(feature = "instruments")]
-        println!("- Started round 2: Compute composition polynomial");
-        #[cfg(feature = "instruments")]
-        let timer2 = Instant::now();
-
         // <<<< Receive challenge: 𝛽
         let beta = transcript.sample_field_element();
         let trace_length = domain.interpolation_domain_size;
@@ -1706,26 +1825,18 @@ pub trait IsStarkProver<
         // >>>> Send commitments: [H₁], [H₂]
         transcript.append_bytes(&round_2_result.composition_poly_root);
 
-        #[cfg(feature = "instruments")]
-        let elapsed2 = timer2.elapsed();
-        #[cfg(feature = "instruments")]
-        println!("  Time spent: {:?}", elapsed2);
-
         // ===================================
         // ==========|   Round 3   |==========
         // ===================================
 
-        #[cfg(feature = "instruments")]
-        println!("- Started round 3: Evaluate polynomial in out of domain elements");
-        #[cfg(feature = "instruments")]
-        let timer3 = Instant::now();
-
         // <<<< Receive challenge: z
         let z = transcript.sample_z_ood(
             &domain.lde_roots_of_unity_coset,
             &domain.trace_roots_of_unity,
         );
 
+        #[cfg(feature = "instruments")]
+        let t_r3 = Instant::now();
         let round_3_result = Self::round_3_evaluate_polynomials_in_out_of_domain_element(
             air,
             domain,
@@ -1733,6 +1844,8 @@ pub trait IsStarkProver<
             &round_2_result,
             &z,
         );
+        #[cfg(feature = "instruments")]
+        let round_3_dur = t_r3.elapsed();
 
         // >>>> Send values: tⱼ(zgᵏ)
         let trace_ood_evaluations_columns = round_3_result.trace_ood_evaluations.columns();
@@ -1747,20 +1860,10 @@ pub trait IsStarkProver<
             transcript.append_field_element(element);
         }
 
-        #[cfg(feature = "instruments")]
-        let elapsed3 = timer3.elapsed();
-        #[cfg(feature = "instruments")]
-        println!("  Time spent: {:?}", elapsed3);
-
         // ===================================
         // ==========|   Round 4   |==========
         // ===================================
 
-        #[cfg(feature = "instruments")]
-        println!("- Started round 4: FRI");
-        #[cfg(feature = "instruments")]
-        let timer4 = Instant::now();
-
         // Part of this round is running FRI, which is an interactive
         // protocol on its own. Therefore we pass it the transcript
         // to simulate the interactions with the verifier.
@@ -1774,20 +1877,24 @@ pub trait IsStarkProver<
             transcript,
         );
 
-        #[cfg(feature = "instruments")]
-        let elapsed4 = timer4.elapsed();
-        #[cfg(feature = "instruments")]
-        println!("  Time spent: {:?}", elapsed4);
-
         #[cfg(feature = "instruments")]
         {
-            let total_time = elapsed2 + elapsed3 + elapsed4;
-            println!(
-                " Fraction of proving time per round: {:.4} {:.4} {:.4}",
-                elapsed2.as_nanos() as f64 / total_time.as_nanos() as f64,
-                elapsed3.as_nanos() as f64 / total_time.as_nanos() as f64,
-                elapsed4.as_nanos() as f64 / total_time.as_nanos() as f64
-            );
+            let zero = std::time::Duration::ZERO;
+            let (r2_constraints, r2_fft, r2_merkle) =
+                crate::instruments::take_r2_sub().unwrap_or((zero, zero, zero));
+            let (r4_fft, r4_merkle, r4_deep_comp, r4_queries) =
+                crate::instruments::take_r4_sub().unwrap_or((zero, zero, zero, zero));
+            crate::instruments::store_round_sub_ops(crate::instruments::TableSubOps {
+                trace_lde: std::time::Duration::ZERO, // added by caller from lde_dur
+                constraints: r2_constraints,
+                comp_decompose: r2_fft,
+                comp_commit: r2_merkle,
+                ood: round_3_dur,
+                deep_comp: r4_deep_comp,
+                deep_extend: r4_fft,
+                fri_commit: r4_merkle,
+                queries: r4_queries,
+            });
         }
 
         info!("End proof generation");
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
index 56189724d..dac711002 100644
--- a/prover/Cargo.toml
+++ b/prover/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2024"
 default = ["parallel"]
 parallel = ["stark/parallel", "math/parallel", "crypto/parallel", "dep:rayon"]
 debug-checks = ["stark/debug-checks"]
+instruments = ["stark/instruments"]
 
 [dependencies]
 stark = { path = "../crypto/stark" }
@@ -23,3 +24,7 @@ criterion = { version = "0.5", default-features = false }
 [[bench]]
 name = "vm_prover_benchmark"
 harness = false
+
+[[bench]]
+name = "profile_vm_prover"
+harness = false
diff --git a/prover/benches/profile_vm_prover.rs b/prover/benches/profile_vm_prover.rs
index 87cb19f50..5ec78a1d3 100644
--- a/prover/benches/profile_vm_prover.rs
+++ b/prover/benches/profile_vm_prover.rs
@@ -3,13 +3,17 @@
 // Run with: `samply record cargo bench --bench profile_vm_prover --features parallel`
 // Or with hyperfine: `hyperfine --runs 1 './target/release/deps/profile_vm_prover-*'`
 //
-// Uses all_instructions_64.elf which exercises all supported RISC-V instructions.
+// Default ELF: fib_iterative_372k (~372k steps, realistic workload).
+// Override:    cargo bench --bench profile_vm_prover --features parallel -- <elf_name>
 
 use lambda_vm_prover::test_utils::asm_elf_bytes;
 
 fn main() {
-    let elf_name = "all_instructions_64";
-    let elf_bytes = asm_elf_bytes(elf_name);
+    let elf_name = std::env::args()
+        .skip(1)
+        .find(|a| !a.starts_with('-'))
+        .unwrap_or_else(|| "fib_iterative_372k".to_string());
+    let elf_bytes = asm_elf_bytes(&elf_name);
 
     println!("Starting VM prover profiling...");
     println!("Configuration:");
diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs
new file mode 100644
index 000000000..58954a919
--- /dev/null
+++ b/prover/src/instruments.rs
@@ -0,0 +1,204 @@
+use std::collections::BTreeMap;
+use std::time::Duration;
+
+/// Formats a row count for the report: millions with one decimal (`1.5M`),
+/// whole thousands rounded down (`12K`), or the plain number below 1000.
+fn fmt_rows(rows: usize) -> String {
+    match rows {
+        0..=999 => format!("{rows}"),
+        1_000..=999_999 => format!("{}K", rows / 1_000),
+        _ => format!("{:.1}M", rows as f64 / 1_000_000.0),
+    }
+}
+
+/// Share of `total` taken by `dur`, in percent; `0.0` when `total` is zero.
+fn pct(dur: Duration, total: Duration) -> f64 {
+    if total.is_zero() {
+        return 0.0;
+    }
+    dur.as_secs_f64() / total.as_secs_f64() * 100.0
+}
+
+/// Prints a top-level report row to stderr: the label, the wall time in
+/// seconds, and `dur` as a share of `total`, with the percentage in the
+/// first `%` column.
+fn row_top(label: &str, dur: Duration, total: Duration) {
+    let secs = dur.as_secs_f64();
+    let share = pct(dur, total);
+    // Widths line up with the header row printed by `print_report`.
+    eprintln!("  {:<36} {:>7.2}s  {:>5.1}%", label, secs, share);
+}
+
+/// Prints a sub-level report row to stderr: the label, the wall time in
+/// seconds, and `dur` as a share of `total`, with the percentage shifted
+/// right into its own column.
+fn row_sub(label: &str, dur: Duration, total: Duration) {
+    let secs = dur.as_secs_f64();
+    let share = pct(dur, total);
+    // Same layout as `row_top`, plus extra padding before the percentage.
+    eprintln!("  {:<36} {:>7.2}s         {:>5.1}%", label, secs, share);
+}
+
+/// Returns the table name up to (not including) the first `[`, e.g. `MEMW[3]` -> `MEMW`.
+fn base_name(name: &str) -> &str {
+    name.split_once('[').map_or(name, |(base, _index)| base)
+}
+
+struct MergedTable {
+    total_dur: Duration, // summed wall time of all instances sharing a base name
+    total_rows: usize, // summed row count of those instances
+    count: usize, // number of instances merged (shown as `xN` when > 1)
+    sub_ops: stark::instruments::TableSubOps, // field-wise sum of their sub-op timings
+}
+
+/// Print a unified timing report to stderr; every percentage is relative to `total`.
+pub fn print_report(
+    execute: Duration, // printed as the "Execute" row
+    trace_build: Duration, // printed as the "Trace build" row
+    air_construction: Duration, // printed as the "AIR construction" row
+    _prove: Duration, // accepted but not printed by this function
+    total: Duration, // denominator of every % column and the TOTAL row
+) {
+    let mp = stark::instruments::take(); // consumes the stored multi-prove timing, if any
+
+    eprintln!();
+    eprintln!("=== PROVER TIMING ===");
+    eprintln!(
+        "  {:<36} {:>8}  {:>5}",
+        "Phase", "Wall", "%",
+    );
+    eprintln!("  {}", "─".repeat(58));
+
+    row_top("Execute", execute, total);
+    row_top("Trace build", trace_build, total);
+    row_top("AIR construction", air_construction, total);
+
+    if let Some(mp) = mp {
+        let round1 = mp.main_commits + mp.aux_build + mp.aux_commit; // Round 1 = its three phases
+
+        row_top("Pre-pass (domains/twiddles)", mp.prepass, total);
+        row_top("Round 1", round1, total);
+        row_sub("  Main trace commits", mp.main_commits, total);
+        row_sub("    expand_pool_to_lde", mp.round1_sub.main_lde, total);
+        row_sub("    commit (Merkle)", mp.round1_sub.main_merkle, total);
+        row_sub("  Aux trace build (parallel)", mp.aux_build, total);
+        row_sub("  Aux trace commit", mp.aux_commit, total);
+        row_sub("    expand_pool_to_lde", mp.round1_sub.aux_lde, total);
+        row_sub("    commit (Merkle)", mp.round1_sub.aux_merkle, total);
+        row_top("Rounds 2\u{2013}4", mp.rounds_2_4, total);
+
+        // Merge split tables by base name (text before `[`): MEMW[0..4] → MEMW x5
+        let mut merged: BTreeMap<String, MergedTable> = BTreeMap::new();
+        for (name, rows, dur, sub_ops) in &mp.table_timings {
+            let base = base_name(name).to_string();
+            let entry = merged.entry(base).or_insert(MergedTable {
+                total_dur: Duration::ZERO,
+                total_rows: 0,
+                count: 0,
+                sub_ops: stark::instruments::TableSubOps::default(),
+            });
+            entry.total_dur += *dur;
+            entry.total_rows += rows;
+            entry.count += 1;
+            entry.sub_ops.trace_lde += sub_ops.trace_lde;
+            entry.sub_ops.constraints += sub_ops.constraints;
+            entry.sub_ops.comp_decompose += sub_ops.comp_decompose;
+            entry.sub_ops.comp_commit += sub_ops.comp_commit;
+            entry.sub_ops.ood += sub_ops.ood;
+            entry.sub_ops.deep_comp += sub_ops.deep_comp;
+            entry.sub_ops.deep_extend += sub_ops.deep_extend;
+            entry.sub_ops.fri_commit += sub_ops.fri_commit;
+            entry.sub_ops.queries += sub_ops.queries;
+        }
+
+        let mut sorted: Vec<_> = merged.into_iter().collect();
+        sorted.sort_by(|a, b| b.1.total_dur.cmp(&a.1.total_dur)); // slowest merged table first
+
+        let threshold = total.as_secs_f64() * 0.02; // tables under 2% of total are folded below
+        let mut others_dur = Duration::ZERO;
+        let mut others_count = 0usize;
+
+        for (name, t) in &sorted {
+            if t.total_dur.as_secs_f64() >= threshold {
+                let display_name = if t.count > 1 {
+                    format!("{name} x{}", t.count)
+                } else {
+                    name.clone()
+                };
+                let label = format!(
+                    "    {:<18} {:>6}",
+                    display_name,
+                    fmt_rows(t.total_rows),
+                );
+                row_sub(&label, t.total_dur, total);
+            } else {
+                others_dur += t.total_dur;
+                others_count += 1;
+            }
+        }
+        if others_count > 0 {
+            let label = format!("    ({others_count} others)");
+            row_sub(&label, others_dur, total);
+        }
+
+        // Sub-operation totals across all tables (including those folded into "others")
+        let mut total_trace_lde = Duration::ZERO;
+        let mut total_constraints = Duration::ZERO;
+        let mut total_comp_decompose = Duration::ZERO;
+        let mut total_comp_commit = Duration::ZERO;
+        let mut total_ood = Duration::ZERO;
+        let mut total_deep_comp = Duration::ZERO;
+        let mut total_deep_extend = Duration::ZERO;
+        let mut total_fri_commit = Duration::ZERO;
+        let mut total_queries = Duration::ZERO;
+        for (_, t) in &sorted {
+            total_trace_lde += t.sub_ops.trace_lde;
+            total_constraints += t.sub_ops.constraints;
+            total_comp_decompose += t.sub_ops.comp_decompose;
+            total_comp_commit += t.sub_ops.comp_commit;
+            total_ood += t.sub_ops.ood;
+            total_deep_comp += t.sub_ops.deep_comp;
+            total_deep_extend += t.sub_ops.deep_extend;
+            total_fri_commit += t.sub_ops.fri_commit;
+            total_queries += t.sub_ops.queries;
+        }
+
+        let sub_ops_sum = total_trace_lde + total_constraints + total_comp_decompose
+            + total_comp_commit + total_ood + total_deep_comp + total_deep_extend
+            + total_fri_commit + total_queries;
+        if sub_ops_sum > Duration::ZERO {
+            let mut sub_ops: Vec<(&str, Duration)> = vec![
+                ("R1  expand_pool_to_lde", total_trace_lde),
+                ("R2  evaluate", total_constraints),
+                ("R2  decompose_and_extend_d2", total_comp_decompose),
+                ("R2  commit_composition_poly", total_comp_commit),
+                ("R3  OOD evaluation", total_ood),
+                ("R4  deep_composition_poly_evals", total_deep_comp),
+                ("R4  interpolate+evaluate_fft", total_deep_extend),
+                ("R4  fri::commit_phase", total_fri_commit),
+                ("R4  queries & openings", total_queries),
+            ];
+            sub_ops.sort_by(|a, b| b.1.cmp(&a.1)); // slowest sub-operation first
+            eprintln!(
+                "  {}",
+                "    \u{2500}\u{2500} sub-operation totals (all tables) \u{2500}\u{2500}",
+            );
+            for (label, dur) in &sub_ops {
+                row_sub(&format!("    {label}"), *dur, total);
+            }
+        }
+
+        // Cross-round totals: Round 1 sub-timings plus the per-table FFT / Merkle sub-ops
+        let total_fft = mp.round1_sub.main_lde + mp.round1_sub.aux_lde
+            + total_trace_lde + total_comp_decompose + total_deep_extend;
+        let total_merkle = mp.round1_sub.main_merkle + mp.round1_sub.aux_merkle
+            + total_comp_commit + total_fri_commit;
+        eprintln!();
+        eprintln!("  {:<36} {:>7.2}s  {:>5.1}%", "Total FFT", total_fft.as_secs_f64(), pct(total_fft, total));
+        eprintln!("  {:<36} {:>7.2}s  {:>5.1}%", "Total Merkle", total_merkle.as_secs_f64(), pct(total_merkle, total));
+    }
+
+    eprintln!("  {}", "─".repeat(58));
+    eprintln!("  {:<36} {:>7.2}s", "TOTAL", total.as_secs_f64());
+    eprintln!();
+}
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 7539d327a..01f51934e 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -13,6 +13,8 @@
 pub mod constraints;
 #[cfg(feature = "debug-checks")]
 mod debug_report;
+#[cfg(feature = "instruments")]
+pub mod instruments;
 pub mod tables;
 pub mod test_utils;
 pub mod tests;
@@ -342,15 +344,37 @@ pub fn prove_with_options(
     proof_options: &ProofOptions,
     max_rows: &MaxRowsConfig,
 ) -> Result<VmProof, Error> {
+    #[cfg(feature = "instruments")]
+    let total_start = std::time::Instant::now();
+
+    // Phase 1: Execute (ELF load + run)
+    #[cfg(feature = "instruments")]
+    let phase_start = std::time::Instant::now();
+
     let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?;
     let executor = Executor::new(&program, vec![]).map_err(|e| Error::Execution(format!("{e}")))?;
     let result = executor
         .run()
         .map_err(|e| Error::Execution(format!("{e}")))?;
 
+    #[cfg(feature = "instruments")]
+    let execute_elapsed = phase_start.elapsed();
+
+    // Phase 2: Trace build
+    #[cfg(feature = "instruments")]
+    let phase_start = std::time::Instant::now();
+
     // Generate all traces from ELF and execution logs.
     // Page tables are derived from the prover's MemoryState (all accessed pages).
     let mut traces = Traces::from_elf_and_logs(&program, &result.logs, max_rows)?;
+
+    #[cfg(feature = "instruments")]
+    let trace_build_elapsed = phase_start.elapsed();
+
+    // Phase 3: AIR construction
+    #[cfg(feature = "instruments")]
+    let phase_start = std::time::Instant::now();
+
     let table_counts = traces.table_counts();
     let airs = VmAirs::new(
         &program,
@@ -360,14 +384,33 @@ pub fn prove_with_options(
         &table_counts,
     );
 
+    #[cfg(feature = "instruments")]
+    let air_elapsed = phase_start.elapsed();
+
     let runtime_page_ranges = traces.runtime_page_ranges();
 
+    // Phase 4: Prove (multi_prove)
+    #[cfg(feature = "instruments")]
+    let phase_start = std::time::Instant::now();
+
     let proof = Prover::multi_prove(
         airs.air_trace_pairs(&mut traces),
         &mut DefaultTranscript::<E>::new(&[]),
     )
     .map_err(|e| Error::Prover(format!("{e:?}")))?;
 
+    #[cfg(feature = "instruments")]
+    {
+        let prove_elapsed = phase_start.elapsed();
+        instruments::print_report(
+            execute_elapsed,
+            trace_build_elapsed,
+            air_elapsed,
+            prove_elapsed,
+            total_start.elapsed(),
+        );
+    }
+
     Ok(VmProof {
         proof,
         runtime_page_ranges,

From af6aab3a0e6f747356a81e176e75dc5662dfeb9a Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Sat, 28 Mar 2026 17:13:50 +0100
Subject: [PATCH 09/34] Fix syscall number

---
 bench_vs/lambda/fibonacci/src/main.rs | 2 +-
 bench_vs/run.sh                       | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/bench_vs/lambda/fibonacci/src/main.rs b/bench_vs/lambda/fibonacci/src/main.rs
index 8f54cf604..ff06237bc 100644
--- a/bench_vs/lambda/fibonacci/src/main.rs
+++ b/bench_vs/lambda/fibonacci/src/main.rs
@@ -16,7 +16,7 @@ fn halt(code: u64) -> ! {
         core::arch::asm!(
             "ecall",
             in("a0") code,
-            in("a7") 5u64,
+            in("a7") 93u64,
             options(noreturn),
         );
     }
diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 4aa249a5d..8c3cb8179 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -107,7 +107,11 @@ run_one() {
         LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench"
 
         echo -e "  ${GREEN}[Lambda VM] Proving...${NC}"
-        LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null)
+        LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>"$TMP_DIR/lambda_err.txt")
+        if [ $? -ne 0 ]; then
+            echo -e "  ${RED}[Lambda VM] FAILED:${NC}"
+            cat "$TMP_DIR/lambda_err.txt"
+        fi
         lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*')
         echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
     fi

From 687af82fa77e456f0d93c4fd40deb174fac6a72f Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 31 Mar 2026 19:49:01 -0300
Subject: [PATCH 10/34] Add runtime private inputs, nightly CI workflow, and
 500M step projection to bench_vs

---
 .github/workflows/bench-vs-nightly.yml |  58 +++
 bench_vs/lambda/fibonacci/build.rs     |  10 -
 bench_vs/lambda/fibonacci/src/main.rs  |  49 ++-
 bench_vs/run.sh                        | 481 +++++++++++++++++++++----
 bin/cli/src/main.rs                    |  68 +++-
 crypto/stark/src/lib.rs                |   2 -
 prover/src/lib.rs                      |  22 +-
 7 files changed, 590 insertions(+), 100 deletions(-)
 create mode 100644 .github/workflows/bench-vs-nightly.yml
 delete mode 100644 bench_vs/lambda/fibonacci/build.rs

diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml
new file mode 100644
index 000000000..50e88bb63
--- /dev/null
+++ b/.github/workflows/bench-vs-nightly.yml
@@ -0,0 +1,58 @@
+name: Bench Vs Nightly
+
+on:
+  schedule:
+    # 03:00 America/Argentina/Buenos_Aires = 06:00 UTC
+    - cron: "0 6 * * *"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: bench-vs-nightly-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  bench-vs:
+    runs-on: [self-hosted, bench]
+    timeout-minutes: 720
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Rust Environment
+        uses: ./.github/actions/setup-rust
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Add SP1 to PATH
+        run: echo "$HOME/.sp1/bin" >> "$GITHUB_PATH"
+
+      - name: Install SP1 toolchain
+        run: |
+          export PATH="$HOME/.cargo/bin:$HOME/.sp1/bin:$PATH"
+          if ! cargo prove --version >/dev/null 2>&1; then
+            curl -L https://sp1up.succinct.xyz | bash
+            export PATH="$HOME/.sp1/bin:$PATH"
+            sp1up
+          fi
+          cargo prove --version
+
+      - name: Run nightly benchmark
+        run: |
+          bash ./bench_vs/run.sh \
+            -n 1000000 2000000 4000000 8000000 \
+            --report-dir bench_vs_artifacts \
+            --no-color
+
+      - name: Upload nightly benchmark artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-vs-nightly-${{ github.sha }}
+          path: bench_vs_artifacts
+          retention-days: 90
+
+      - name: Publish summary
+        run: cat bench_vs_artifacts/summary.md >> "$GITHUB_STEP_SUMMARY"
diff --git a/bench_vs/lambda/fibonacci/build.rs b/bench_vs/lambda/fibonacci/build.rs
deleted file mode 100644
index 5c189eadb..000000000
--- a/bench_vs/lambda/fibonacci/build.rs
+++ /dev/null
@@ -1,10 +0,0 @@
-use std::env;
-use std::fs;
-use std::path::Path;
-
-fn main() {
-    let n = env::var("BENCH_N").unwrap_or_else(|_| "1000".to_string());
-    let out_dir = env::var("OUT_DIR").unwrap();
-    fs::write(Path::new(&out_dir).join("n.txt"), &n).unwrap();
-    println!("cargo:rerun-if-env-changed=BENCH_N");
-}
diff --git a/bench_vs/lambda/fibonacci/src/main.rs b/bench_vs/lambda/fibonacci/src/main.rs
index ff06237bc..e9e673e0c 100644
--- a/bench_vs/lambda/fibonacci/src/main.rs
+++ b/bench_vs/lambda/fibonacci/src/main.rs
@@ -1,22 +1,52 @@
 #![no_std]
 #![no_main]
 
+use core::arch::asm;
 use core::panic::PanicInfo;
 
+const SYSCALL_GET_PRIVATE_INPUTS: u64 = 4;
+const SYSCALL_COMMIT: u64 = 64;
+const SYSCALL_HALT: u64 = 93;
+
 #[panic_handler]
 fn panic(_info: &PanicInfo) -> ! {
     loop {}
 }
 
-const N: u64 = include!(concat!(env!("OUT_DIR"), "/n.txt"));
+fn read_n() -> u64 {
+    let mut input = [0u8; 12];
+
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") input.as_mut_ptr(),
+            in("a7") SYSCALL_GET_PRIVATE_INPUTS,
+        );
+    }
+
+    let mut n_bytes = [0u8; 8];
+    n_bytes.copy_from_slice(&input[4..12]);
+    u64::from_le_bytes(n_bytes)
+}
 
-#[inline(never)]
-fn halt(code: u64) -> ! {
+fn commit(bytes: &[u8]) {
     unsafe {
-        core::arch::asm!(
+        asm!(
             "ecall",
-            in("a0") code,
-            in("a7") 93u64,
+            in("a0") 1u64,
+            in("a1") bytes.as_ptr(),
+            in("a2") bytes.len(),
+            in("a7") SYSCALL_COMMIT,
+        );
+    }
+}
+
+fn halt() -> ! {
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 0u64,
+            in("a7") SYSCALL_HALT,
             options(noreturn),
         );
     }
@@ -24,12 +54,15 @@ fn halt(code: u64) -> ! {
 
 #[unsafe(no_mangle)]
 pub fn main() -> ! {
+    let n = read_n();
     let mut a: u64 = 0;
     let mut b: u64 = 1;
-    for _ in 0..N {
+    for _ in 0..n {
         let c = a.wrapping_add(b);
         a = b;
         b = c;
     }
-    halt(b)
+
+    commit(&b.to_le_bytes());
+    halt()
 }
diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 8c3cb8179..5592e095c 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -2,20 +2,23 @@
 # Benchmark: Lambda VM vs SP1 v6 — Fibonacci proving time comparison.
 #
 # Usage: ./bench_vs/run.sh [-n 1000 50000 100000] [--lambda-only | --sp1-only]
+#                         [--report-dir DIR] [--no-color]
 #
 # Without -n, runs the default series: 1000 10000 100000 300000
-# With -n, runs the specified values (space-separated): -n 1000 50000
 #
 # Prerequisites:
-#   - Lambda VM CLI built: cargo build --release -p cli
-#   - SP1 toolchain installed: curl -L https://sp1up.succinct.xyz | bash && sp1up
-#   - Rust nightly toolchain: rustup toolchain install nightly
+#   - Lambda VM CLI build dependencies available
+#   - SP1 toolchain installed (or available in PATH for CI)
+#   - Rust stable + nightly-2026-02-01 installed
 
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
 TMP_DIR="/tmp/bench_fib"
+REPORT_DIR=""
+NO_COLOR=false
+TARGET_STEPS=500000000
 
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@@ -23,31 +26,53 @@ YELLOW='\033[1;33m'
 BOLD='\033[1m'
 NC='\033[0m'
 
-# --- Defaults ----------------------------------------------------------------
+# --- Defaults ---------------------------------------------------------------
 DEFAULT_SERIES=(1000 10000 100000 300000)
 SERIES=()
 RUN_LAMBDA=true
 RUN_SP1=true
 
-# --- Parse args --------------------------------------------------------------
+# --- Parse args -------------------------------------------------------------
 while [[ $# -gt 0 ]]; do
     case $1 in
-        -n) shift
+        -n)
+            shift
             while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
-                SERIES+=("$1"); shift
-            done ;;
-        --lambda-only) RUN_SP1=false; shift ;;
-        --sp1-only) RUN_LAMBDA=false; shift ;;
+                SERIES+=("$1")
+                shift
+            done
+            ;;
+        --lambda-only)
+            RUN_SP1=false
+            shift
+            ;;
+        --sp1-only)
+            RUN_LAMBDA=false
+            shift
+            ;;
+        --report-dir)
+            REPORT_DIR=$2
+            shift 2
+            ;;
+        --no-color)
+            NO_COLOR=true
+            shift
+            ;;
         -h|--help)
-            echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only]"
+            echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--no-color]"
             echo ""
-            echo "  -n N1 N2 ...    Fibonacci iteration counts (space-separated)"
-            echo "                  Default series: ${DEFAULT_SERIES[*]}"
-            echo "  --lambda-only   Only run Lambda VM benchmark"
-            echo "  --sp1-only      Only run SP1 benchmark"
+            echo "  -n N1 N2 ...      Fibonacci iteration counts (space-separated)"
+            echo "                    Default series: ${DEFAULT_SERIES[*]}"
+            echo "  --lambda-only     Only run Lambda VM benchmark"
+            echo "  --sp1-only        Only run SP1 benchmark"
+            echo "  --report-dir DIR  Write TSV, metrics, markdown summary, and raw outputs"
+            echo "  --no-color        Disable ANSI colors"
             exit 0
             ;;
-        *) echo "Unknown option: $1"; exit 1 ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
     esac
 done
 
@@ -55,21 +80,152 @@ if [ ${#SERIES[@]} -eq 0 ]; then
     SERIES=("${DEFAULT_SERIES[@]}")
 fi
 
+if ! $RUN_LAMBDA && ! $RUN_SP1; then
+    echo "At least one prover must be enabled"
+    exit 1
+fi
+
+if $NO_COLOR; then
+    RED=''
+    GREEN=''
+    YELLOW=''
+    BOLD=''
+    NC=''
+fi
+
+mkdir -p "$TMP_DIR"
+rm -rf "$TMP_DIR"/*
+
+if [ -n "$REPORT_DIR" ]; then
+    mkdir -p "$REPORT_DIR/raw"
+fi
+
+join_slash() {
+    local joined=""
+    local value
+    for value in "$@"; do
+        joined="${joined:+$joined/}$value"
+    done
+    printf "%s\n" "$joined"
+}
+
+fit_series() {
+    local steps_slash=$1
+    local values_slash=$2
+
+    awk -v steps="$steps_slash" -v values="$values_slash" 'BEGIN {
+        n = split(steps, xs, "/")
+        m = split(values, ys, "/")
+        if (n == 0 || n != m) {
+            print "0 0 0.0000"
+            exit
+        }
+
+        sx = 0; sy = 0; sxy = 0; sx2 = 0
+        for (i = 1; i <= n; i++) {
+            x = xs[i] / 1000000
+            y = ys[i] + 0
+            sx += x
+            sy += y
+            sxy += x * y
+            sx2 += x * x
+        }
+
+        d = n * sx2 - sx * sx
+        if (d == 0) {
+            intercept = sy / n
+            printf "0 %.6f 0.0000\n", intercept
+            exit
+        }
+
+        slope = (n * sxy - sx * sy) / d
+        intercept = (sy - slope * sx) / n
+
+        my = sy / n
+        ss_tot = 0
+        ss_res = 0
+        for (i = 1; i <= n; i++) {
+            x = xs[i] / 1000000
+            y = ys[i] + 0
+            pred = slope * x + intercept
+            ss_res += (y - pred) * (y - pred)
+            ss_tot += (y - my) * (y - my)
+        }
+
+        r2 = (ss_tot > 0) ? 1 - ss_res / ss_tot : 0
+        if (r2 < 0) {
+            r2 = 0
+        }
+
+        printf "%.6f %.6f %.4f\n", slope, intercept, r2
+    }'
+}
+
+project_series() {
+    local slope=$1
+    local intercept=$2
+    local target_steps=$3
+
+    awk -v slope="$slope" -v intercept="$intercept" -v target="$target_steps" 'BEGIN {
+        projected = slope * (target / 1000000) + intercept
+        if (projected < 0) {
+            projected = 0
+        }
+        printf "%.3f\n", projected
+    }'
+}
+
+format_hours() {
+    local seconds=$1
+    awk -v value="$seconds" 'BEGIN { printf "%.2f\n", value / 3600 }'
+}
+
+write_u64_le() {
+    local value=$1
+    local output_path=$2
+
+    python3 - "$value" "$output_path" <<'PY'
+import struct
+import sys
+
+value = int(sys.argv[1])
+path = sys.argv[2]
+
+with open(path, "wb") as fh:
+    fh.write(struct.pack("<Q", value))
+PY
+}
+
 echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}"
 echo -e "Series: ${YELLOW}${SERIES[*]}${NC}"
 echo ""
 
-rm -rf "$TMP_DIR" && mkdir -p "$TMP_DIR"
-
-# --- Pre-build ---------------------------------------------------------------
+# --- Pre-build --------------------------------------------------------------
 
 CLI="$ROOT_DIR/target/release/cli"
 LAMBDA_DIR="$SCRIPT_DIR/lambda/fibonacci"
 TARGET_SPEC="$ROOT_DIR/executor/programs/riscv64im-lambda-vm-elf.json"
+LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench"
 
-if $RUN_LAMBDA && [ ! -f "$CLI" ]; then
-    echo -e "${YELLOW}[Lambda VM] CLI not found, building...${NC}"
-    cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1
+if $RUN_LAMBDA; then
+    echo -e "${GREEN}[Lambda VM] Building CLI...${NC}"
+    cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -5
+fi
+
+if $RUN_LAMBDA; then
+    echo -e "${GREEN}[Lambda VM] Building fibonacci prover...${NC}"
+    (
+        cd "$LAMBDA_DIR" && \
+        cargo +nightly-2026-02-01 build --release \
+            --target "$TARGET_SPEC" \
+            -Z build-std=core \
+            -Z build-std-features=compiler-builtins-mem \
+            -Z json-target-spec 2>&1 | tail -5
+    )
+    if [ ! -f "$LAMBDA_ELF" ]; then
+        echo -e "${RED}[Lambda VM] Build failed — fibonacci-bench ELF not found${NC}"
+        exit 1
+    fi
 fi
 
 SP1_BIN=""
@@ -84,101 +240,284 @@ if $RUN_SP1; then
     fi
 fi
 
-# --- Run one benchmark --------------------------------------------------------
+# --- Run benchmark series ---------------------------------------------------
+
+RESULT_N=()
+RESULT_LAMBDA=()
+RESULT_SP1=()
+RESULT_SP1_CYCLES=()
+RESULT_RATIO=()
 
-# Arrays to collect results for the summary table
-declare -a RESULT_N RESULT_LAMBDA RESULT_SP1
+LAMBDA_STEPS=()
+LAMBDA_TIMES=()
+SP1_STEPS=()
+SP1_TIMES=()
+
+if [ -n "$REPORT_DIR" ]; then
+    printf "n\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv"
+fi
 
 run_one() {
-    local N=$1
-    echo ""
-    echo -e "${BOLD}--- n=${N} ---${NC}"
+    local n=$1
+    local lambda_time="n/a"
+    local sp1_time="n/a"
+    local sp1_cycles="n/a"
+    local ratio="n/a"
 
-    local lambda_time=""
-    local sp1_time=""
-    local sp1_cycles=""
+    echo ""
+    echo -e "${BOLD}--- n=${n} ---${NC}"
 
     if $RUN_LAMBDA; then
-        echo -e "  ${GREEN}[Lambda VM] Building (n=${N})...${NC}"
-        (cd "$LAMBDA_DIR" && BENCH_N="$N" cargo +nightly build --release \
-            --target "$TARGET_SPEC" \
-            -Z build-std=core -Z build-std-features=compiler-builtins-mem 2>&1 \
-            -Z json-target-spec 2>&1 | tail -1)
-        LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench"
+        local input_file="$TMP_DIR/lambda_${n}.bin"
+        local proof_file="$TMP_DIR/lambda_${n}.proof"
+        local stderr_file="$TMP_DIR/lambda_${n}.stderr"
+        write_u64_le "$n" "$input_file"
 
         echo -e "  ${GREEN}[Lambda VM] Proving...${NC}"
-        LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>"$TMP_DIR/lambda_err.txt")
-        if [ $? -ne 0 ]; then
+        local lambda_output
+        if ! lambda_output=$("$CLI" prove "$LAMBDA_ELF" -o "$proof_file" --private-input "$input_file" --time 2>"$stderr_file"); then
             echo -e "  ${RED}[Lambda VM] FAILED:${NC}"
-            cat "$TMP_DIR/lambda_err.txt"
+            cat "$stderr_file"
+            exit 1
         fi
-        lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*')
+        rm -f "$proof_file"
+
+        lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*' || true)
+        if [ -z "$lambda_time" ]; then  # reachable only thanks to "|| true": set -e + pipefail would exit silently
+            echo -e "  ${RED}[Lambda VM] FAILED: could not parse proving time${NC}"
+            printf "%s\n" "$lambda_output"
+            exit 1
+        fi
+
         echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
+        LAMBDA_STEPS+=("$n")
+        LAMBDA_TIMES+=("$lambda_time")
+
+        if [ -n "$REPORT_DIR" ]; then
+            printf "%s\n" "$lambda_output" > "$REPORT_DIR/raw/lambda_${n}.stdout"
+            cp "$stderr_file" "$REPORT_DIR/raw/lambda_${n}.stderr"
+        fi
     fi
 
     if $RUN_SP1; then
         echo -e "  ${GREEN}[SP1 v6] Proving...${NC}"
-        SP1_OUTPUT=$("$SP1_BIN" "$N" 2>/dev/null)
-        sp1_time=$(echo "$SP1_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*')
-        sp1_cycles=$(echo "$SP1_OUTPUT" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*')
+        local sp1_output_file="$TMP_DIR/sp1_${n}.stdout"
+        if ! "$SP1_BIN" "$n" > "$sp1_output_file" 2>&1; then
+            echo -e "  ${RED}[SP1 v6] FAILED:${NC}"
+            cat "$sp1_output_file"
+            exit 1
+        fi
+
+        sp1_time=$(grep -o 'Proving time: [0-9.]*s' "$sp1_output_file" | grep -o '[0-9.]*' || true)
+        sp1_cycles=$(grep -o 'Cycles: [0-9]*' "$sp1_output_file" | grep -o '[0-9]*' || true)
+        if [ -z "$sp1_time" ] || [ -z "$sp1_cycles" ]; then  # reachable only thanks to "|| true" above (set -e + pipefail)
+            echo -e "  ${RED}[SP1 v6] FAILED: could not parse output${NC}"
+            cat "$sp1_output_file"
+            exit 1
+        fi
+
         echo -e "  SP1 v6:    ${BOLD}${sp1_time}s${NC} (${sp1_cycles} cycles)"
+        SP1_STEPS+=("$n")
+        SP1_TIMES+=("$sp1_time")
+
+        if [ -n "$REPORT_DIR" ]; then
+            cp "$sp1_output_file" "$REPORT_DIR/raw/sp1_${n}.stdout"
+        fi
     fi
 
-    RESULT_N+=("$N")
-    RESULT_LAMBDA+=("${lambda_time:-n/a}")
-    RESULT_SP1+=("${sp1_time:-n/a}")
-}
+    if [ "$lambda_time" != "n/a" ] && [ "$sp1_time" != "n/a" ]; then
+        ratio=$(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { printf "%.3f", lambda / sp1 }')
+    fi
+
+    RESULT_N+=("$n")
+    RESULT_LAMBDA+=("$lambda_time")
+    RESULT_SP1+=("$sp1_time")
+    RESULT_SP1_CYCLES+=("$sp1_cycles")
+    RESULT_RATIO+=("$ratio")
 
-# --- Run series ---------------------------------------------------------------
+    if [ -n "$REPORT_DIR" ]; then
+        printf "%s\t%s\t%s\t%s\t%s\n" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" "$ratio" >> "$REPORT_DIR/results.tsv"
+    fi
+}
 
-for N in "${SERIES[@]}"; do
-    run_one "$N"
+for n in "${SERIES[@]}"; do
+    run_one "$n"
 done
 
-# --- Summary table ------------------------------------------------------------
+# --- Projection -------------------------------------------------------------
+
+LAMBDA_SLOPE=""
+LAMBDA_INTERCEPT=""
+LAMBDA_R2=""
+LAMBDA_PROJECTED_S=""
+LAMBDA_PROJECTED_H=""
+
+SP1_SLOPE=""
+SP1_INTERCEPT=""
+SP1_R2=""
+SP1_PROJECTED_S=""
+SP1_PROJECTED_H=""
+
+compute_projection() {
+    local label=$1
+    local steps_slash=$2
+    local times_slash=$3
+    local slope intercept r2 projected_s projected_h
+
+    if [ -z "$steps_slash" ] || [ -z "$times_slash" ]; then
+        return 0
+    fi
+
+    read -r slope intercept r2 <<< "$(fit_series "$steps_slash" "$times_slash")"
+    projected_s=$(project_series "$slope" "$intercept" "$TARGET_STEPS")
+    projected_h=$(format_hours "$projected_s")
+
+    case "$label" in
+        lambda)
+            LAMBDA_SLOPE=$slope
+            LAMBDA_INTERCEPT=$intercept
+            LAMBDA_R2=$r2
+            LAMBDA_PROJECTED_S=$projected_s
+            LAMBDA_PROJECTED_H=$projected_h
+            ;;
+        sp1)
+            SP1_SLOPE=$slope
+            SP1_INTERCEPT=$intercept
+            SP1_R2=$r2
+            SP1_PROJECTED_S=$projected_s
+            SP1_PROJECTED_H=$projected_h
+            ;;
+    esac
+}
+
+if $RUN_LAMBDA && [ ${#LAMBDA_STEPS[@]} -gt 0 ]; then
+    compute_projection "lambda" "$(join_slash "${LAMBDA_STEPS[@]}")" "$(join_slash "${LAMBDA_TIMES[@]}")"
+fi
+if $RUN_SP1 && [ ${#SP1_STEPS[@]} -gt 0 ]; then
+    compute_projection "sp1" "$(join_slash "${SP1_STEPS[@]}")" "$(join_slash "${SP1_TIMES[@]}")"
+fi
+
+# --- Summary table ----------------------------------------------------------
 
 echo ""
 echo -e "${BOLD}=== Summary ===${NC}"
 echo -e "Program: Fibonacci (u64 wrapping)"
 echo ""
 
-# Header
 if $RUN_LAMBDA && $RUN_SP1; then
-    printf "  %-10s  %12s  %12s  %8s\n" "n" "Lambda VM" "SP1 v6" "Ratio"
-    printf "  %-10s  %12s  %12s  %8s\n" "---" "---------" "------" "-----"
+    printf "  %-10s  %12s  %12s  %12s  %8s\n" "n" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio"
+    printf "  %-10s  %12s  %12s  %12s  %8s\n" "---" "---------" "------" "----------" "-----"
 elif $RUN_LAMBDA; then
     printf "  %-10s  %12s\n" "n" "Lambda VM"
     printf "  %-10s  %12s\n" "---" "---------"
 else
-    printf "  %-10s  %12s\n" "n" "SP1 v6"
-    printf "  %-10s  %12s\n" "---" "------"
+    printf "  %-10s  %12s  %12s\n" "n" "SP1 v6" "SP1 cycles"
+    printf "  %-10s  %12s  %12s\n" "---" "------" "----------"
 fi
 
 for i in "${!RESULT_N[@]}"; do
     n="${RESULT_N[$i]}"
-    lt="${RESULT_LAMBDA[$i]}"
-    st="${RESULT_SP1[$i]}"
+    lambda_time="${RESULT_LAMBDA[$i]}"
+    sp1_time="${RESULT_SP1[$i]}"
+    sp1_cycles="${RESULT_SP1_CYCLES[$i]}"
+    ratio="${RESULT_RATIO[$i]}"
 
     if $RUN_LAMBDA && $RUN_SP1; then
-        if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then
-            RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $lt / $st}")
-            if (( $(LC_NUMERIC=C awk "BEGIN {print ($lt > $st)}") )); then
-                RATIO="${RED}${RATIO}${NC}"
+        if [ "$ratio" != "n/a" ]; then
+            ratio_colored=$(LC_NUMERIC=C awk -v ratio="$ratio" 'BEGIN { printf "%.1fx", ratio }')
+            if (( $(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { print (lambda > sp1) }') )); then
+                ratio_colored="${RED}${ratio_colored}${NC}"
             else
-                RATIO="${GREEN}${RATIO}${NC}"
+                ratio_colored="${GREEN}${ratio_colored}${NC}"
             fi
-            printf "  %-10s  %11ss  %11ss  " "$n" "$lt" "$st"
-            echo -e "$RATIO"
+            printf "  %-10s  %11ss  %11ss  %12s  " "$n" "$lambda_time" "$sp1_time" "$sp1_cycles"
+            echo -e "$ratio_colored"
         else
-            printf "  %-10s  %12s  %12s  %8s\n" "$n" "${lt}s" "${st}s" "-"
+            printf "  %-10s  %12s  %12s  %12s  %8s\n" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-"
         fi
     elif $RUN_LAMBDA; then
-        printf "  %-10s  %11ss\n" "$n" "$lt"
+        printf "  %-10s  %11ss\n" "$n" "$lambda_time"
     else
-        printf "  %-10s  %11ss\n" "$n" "$st"
+        printf "  %-10s  %11ss  %12s\n" "$n" "$sp1_time" "$sp1_cycles"
     fi
 done
 
 echo ""
-echo -e "Green ratio = Lambda VM faster, Red = SP1 faster"
+if $RUN_LAMBDA && $RUN_SP1; then
+    echo -e "Green ratio = Lambda VM faster, Red = SP1 faster"
+fi
 echo "Raw data in $TMP_DIR/"
+
+if [ -n "$LAMBDA_PROJECTED_S" ] || [ -n "$SP1_PROJECTED_S" ]; then
+    echo ""
+    echo -e "${BOLD}=== Linear Projection to 500M Steps ===${NC}"
+    if [ -n "$LAMBDA_PROJECTED_S" ]; then
+        echo "  Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R²=${LAMBDA_R2}"
+    fi
+    if [ -n "$SP1_PROJECTED_S" ]; then
+        echo "  SP1 v6:    ${SP1_PROJECTED_S}s (${SP1_PROJECTED_H}h), R²=${SP1_R2}"
+    fi
+fi
+
+# --- Machine-readable report ------------------------------------------------
+
+if [ -n "$REPORT_DIR" ]; then
+    {
+        echo "target_steps=$TARGET_STEPS"
+        echo "series=$(join_slash "${RESULT_N[@]}")"
+        echo "lambda_times=$(join_slash "${RESULT_LAMBDA[@]}")"
+        echo "sp1_times=$(join_slash "${RESULT_SP1[@]}")"
+        echo "sp1_cycles=$(join_slash "${RESULT_SP1_CYCLES[@]}")"
+        echo "ratios=$(join_slash "${RESULT_RATIO[@]}")"
+        if [ -n "$LAMBDA_PROJECTED_S" ]; then
+            echo "lambda_slope_s_per_1m=$LAMBDA_SLOPE"
+            echo "lambda_intercept_s=$LAMBDA_INTERCEPT"
+            echo "lambda_r2=$LAMBDA_R2"
+            echo "lambda_projected_time_s=$LAMBDA_PROJECTED_S"
+            echo "lambda_projected_time_h=$LAMBDA_PROJECTED_H"
+        fi
+        if [ -n "$SP1_PROJECTED_S" ]; then
+            echo "sp1_slope_s_per_1m=$SP1_SLOPE"
+            echo "sp1_intercept_s=$SP1_INTERCEPT"
+            echo "sp1_r2=$SP1_R2"
+            echo "sp1_projected_time_s=$SP1_PROJECTED_S"
+            echo "sp1_projected_time_h=$SP1_PROJECTED_H"
+        fi
+    } > "$REPORT_DIR/metrics.txt"
+
+    {
+        echo "# Lambda VM vs SP1 v6 Benchmark"
+        echo
+        echo "| n | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |"
+        echo "|--:|--------------:|-----------:|-----------:|------:|"
+        for i in "${!RESULT_N[@]}"; do
+            printf "| %s | %s | %s | %s | %s |\n" \
+                "${RESULT_N[$i]}" \
+                "${RESULT_LAMBDA[$i]}" \
+                "${RESULT_SP1[$i]}" \
+                "${RESULT_SP1_CYCLES[$i]}" \
+                "${RESULT_RATIO[$i]}"
+        done
+        echo
+        echo "## Linear Projection to 500M Steps"
+        echo
+        echo "| Prover | Slope (s / 1M steps) | Intercept (s) | R² | Projected @ 500M (s) | Projected @ 500M (h) |"
+        echo "|--------|----------------------:|--------------:|---:|---------------------:|---------------------:|"
+        if [ -n "$LAMBDA_PROJECTED_S" ]; then
+            printf "| Lambda VM | %s | %s | %s | %s | %s |\n" \
+                "$LAMBDA_SLOPE" \
+                "$LAMBDA_INTERCEPT" \
+                "$LAMBDA_R2" \
+                "$LAMBDA_PROJECTED_S" \
+                "$LAMBDA_PROJECTED_H"
+        fi
+        if [ -n "$SP1_PROJECTED_S" ]; then
+            printf "| SP1 v6 | %s | %s | %s | %s | %s |\n" \
+                "$SP1_SLOPE" \
+                "$SP1_INTERCEPT" \
+                "$SP1_R2" \
+                "$SP1_PROJECTED_S" \
+                "$SP1_PROJECTED_H"
+        fi
+    } > "$REPORT_DIR/summary.md"
+fi
diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index 725f0de5f..3a1917a32 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -101,6 +101,10 @@ enum Commands {
         #[arg(value_parser, value_hint = ValueHint::FilePath)]
         elf: PathBuf,
 
+        /// Path to the private input file
+        #[arg(long, value_hint = ValueHint::FilePath)]
+        private_input: Option<PathBuf>,
+
         /// Generate flamegraph folded stacks to file
         #[arg(long, value_hint = ValueHint::FilePath)]
         flamegraph: Option<PathBuf>,
@@ -116,6 +120,10 @@ enum Commands {
         #[arg(short, long, value_hint = ValueHint::FilePath)]
         output: PathBuf,
 
+        /// Path to the private input file
+        #[arg(long, value_hint = ValueHint::FilePath)]
+        private_input: Option<PathBuf>,
+
         /// Blowup factor (power of 2). Higher = fewer queries, smaller proof, slower proving.
         #[arg(long)]
         blowup: Option<u8>,
@@ -149,13 +157,18 @@ fn main() -> ExitCode {
     let cli = Cli::parse();
 
     match cli.command {
-        Commands::Execute { elf, flamegraph } => cmd_execute(elf, flamegraph),
+        Commands::Execute {
+            elf,
+            private_input,
+            flamegraph,
+        } => cmd_execute(elf, private_input, flamegraph),
         Commands::Prove {
             elf,
             output,
+            private_input,
             blowup,
             time,
-        } => cmd_prove(elf, output, blowup, time),
+        } => cmd_prove(elf, output, private_input, blowup, time),
         Commands::Verify {
             proof,
             elf,
@@ -165,7 +178,21 @@ fn main() -> ExitCode {
     }
 }
 
-fn cmd_execute(elf_path: PathBuf, flamegraph_path: Option<PathBuf>) -> ExitCode {
+fn read_private_input(path: Option<&PathBuf>) -> Result<Vec<u8>, String> {
+    match path {
+        Some(path) => {
+            eprintln!("Reading private input file...");
+            std::fs::read(path).map_err(|e| format!("Failed to read private input file: {e}"))
+        }
+        None => Ok(vec![]),
+    }
+}
+
+fn cmd_execute(
+    elf_path: PathBuf,
+    private_input_path: Option<PathBuf>,
+    flamegraph_path: Option<PathBuf>,
+) -> ExitCode {
     let elf_data = match std::fs::read(&elf_path) {
         Ok(data) => data,
         Err(e) => {
@@ -182,7 +209,15 @@ fn cmd_execute(elf_path: PathBuf, flamegraph_path: Option<PathBuf>) -> ExitCode
         }
     };
 
-    let mut executor = match Executor::new(&program, vec![]) {
+    let private_inputs = match read_private_input(private_input_path.as_ref()) {
+        Ok(inputs) => inputs,
+        Err(e) => {
+            eprintln!("{e}");
+            return ExitCode::FAILURE;
+        }
+    };
+
+    let mut executor = match Executor::new(&program, private_inputs) {
         Ok(e) => e,
         Err(e) => {
             eprintln!("Failed to create executor: {:?}", e);
@@ -249,7 +284,13 @@ fn cmd_execute(elf_path: PathBuf, flamegraph_path: Option<PathBuf>) -> ExitCode
     ExitCode::SUCCESS
 }
 
-fn cmd_prove(elf_path: PathBuf, output_path: PathBuf, blowup: Option<u8>, time: bool) -> ExitCode {
+fn cmd_prove(
+    elf_path: PathBuf,
+    output_path: PathBuf,
+    private_input_path: Option<PathBuf>,
+    blowup: Option<u8>,
+    time: bool,
+) -> ExitCode {
     eprintln!("Reading ELF file...");
     let elf_data = match std::fs::read(&elf_path) {
         Ok(data) => data,
@@ -259,6 +300,14 @@ fn cmd_prove(elf_path: PathBuf, output_path: PathBuf, blowup: Option<u8>, time:
         }
     };
 
+    let private_inputs = match read_private_input(private_input_path.as_ref()) {
+        Ok(inputs) => inputs,
+        Err(e) => {
+            eprintln!("{e}");
+            return ExitCode::FAILURE;
+        }
+    };
+
     #[cfg(feature = "jemalloc-stats")]
     let tracker = heap_tracker::HeapTracker::start();
 
@@ -276,11 +325,16 @@ fn cmd_prove(elf_path: PathBuf, output_path: PathBuf, blowup: Option<u8>, time:
                 "Generating proof (blowup={b}, queries={})...",
                 opts.fri_number_of_queries
             );
-            prover::prove_with_options(&elf_data, &opts, &Default::default())
+            prover::prove_with_options_and_inputs(
+                &elf_data,
+                &private_inputs,
+                &opts,
+                &Default::default(),
+            )
         }
         None => {
             eprintln!("Generating proof...");
-            prover::prove(&elf_data)
+            prover::prove_with_inputs(&elf_data, &private_inputs)
         }
     };
     let prove_elapsed = start.elapsed();
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index 6cfab6ea3..41089a9e5 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -1,8 +1,6 @@
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug;
 pub mod constraints;
-#[cfg(feature = "instruments")]
-pub mod instruments;
 pub mod context;
 pub mod debug;
 pub mod domain;
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 21bfc9255..a4eb6cd7a 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -461,8 +461,14 @@ pub(crate) fn compute_expected_commit_bus_balance(
 
 /// Prove an ELF binary execution. Returns a serializable proof bundle.
 pub fn prove(elf_bytes: &[u8]) -> Result<VmProof, Error> {
-    prove_with_options(
+    prove_with_inputs(elf_bytes, &[])
+}
+
+/// Prove an ELF binary execution with private inputs. Returns a serializable proof bundle.
+pub fn prove_with_inputs(elf_bytes: &[u8], private_inputs: &[u8]) -> Result<VmProof, Error> {
+    prove_with_options_and_inputs(
         elf_bytes,
+        private_inputs,
         &GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 is always valid"),
         &MaxRowsConfig::default(),
     )
@@ -473,6 +479,17 @@ pub fn prove_with_options(
     elf_bytes: &[u8],
     proof_options: &ProofOptions,
     max_rows: &MaxRowsConfig,
+) -> Result<VmProof, Error> {
+    prove_with_options_and_inputs(elf_bytes, &[], proof_options, max_rows)
+}
+
+/// Prove an ELF binary execution with custom proof options, max rows config,
+/// and explicit private inputs.
+pub fn prove_with_options_and_inputs(
+    elf_bytes: &[u8],
+    private_inputs: &[u8],
+    proof_options: &ProofOptions,
+    max_rows: &MaxRowsConfig,
 ) -> Result<VmProof, Error> {
     #[cfg(feature = "instruments")]
     let total_start = std::time::Instant::now();
@@ -482,7 +499,8 @@ pub fn prove_with_options(
     let phase_start = std::time::Instant::now();
 
     let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?;
-    let executor = Executor::new(&program, vec![]).map_err(|e| Error::Execution(format!("{e}")))?;
+    let executor = Executor::new(&program, private_inputs.to_vec())
+        .map_err(|e| Error::Execution(format!("{e}")))?;
     let result = executor
         .run()
         .map_err(|e| Error::Execution(format!("{e}")))?;

From 0315a80dbd6c5678d761b79cf11485dbc2c0e4e0 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 6 Apr 2026 11:36:46 -0300
Subject: [PATCH 11/34] Reduce nightly bench_vs iteration series to 500k-2M

---
 .github/workflows/bench-vs-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml
index 50e88bb63..eb063c5ae 100644
--- a/.github/workflows/bench-vs-nightly.yml
+++ b/.github/workflows/bench-vs-nightly.yml
@@ -43,7 +43,7 @@ jobs:
       - name: Run nightly benchmark
         run: |
           bash ./bench_vs/run.sh \
-            -n 1000000 2000000 4000000 8000000 \
+            -n 500000 1000000 1500000 2000000 \
             --report-dir bench_vs_artifacts \
             --no-color
 

From 3f3c7aca2634f9bd85b2dc397d08a6b03ab672ef Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 7 Apr 2026 15:47:57 -0300
Subject: [PATCH 12/34] Fix bench_vs 5x projection inflation and add --steps
 flag for nightly 1M/2M/4M/8M benchmarks

---
 .github/workflows/bench-vs-nightly.yml |   4 +-
 bench_vs/README.md                     |  18 ++-
 bench_vs/run.sh                        | 210 +++++++++++++++++++------
 3 files changed, 175 insertions(+), 57 deletions(-)

diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml
index eb063c5ae..68f8d5fac 100644
--- a/.github/workflows/bench-vs-nightly.yml
+++ b/.github/workflows/bench-vs-nightly.yml
@@ -43,14 +43,14 @@ jobs:
       - name: Run nightly benchmark
         run: |
           bash ./bench_vs/run.sh \
-            -n 500000 1000000 1500000 2000000 \
+            --steps 1000000 2000000 4000000 8000000 \
             --report-dir bench_vs_artifacts \
             --no-color
 
       - name: Upload nightly benchmark artifact
         uses: actions/upload-artifact@v4
         with:
-          name: bench-vs-nightly-${{ github.sha }}
+          name: bench-vs-nightly-${{ github.run_number }}-${{ github.sha }}
           path: bench_vs_artifacts
           retention-days: 90
 
diff --git a/bench_vs/README.md b/bench_vs/README.md
index 1be30c5d2..1e8a8d9f3 100644
--- a/bench_vs/README.md
+++ b/bench_vs/README.md
@@ -29,6 +29,9 @@ Compares proving time for an identical u64 wrapping Fibonacci computation.
 # Custom series
 ./bench_vs/run.sh -n 1000 50000
 
+# Approximate workload steps (converted with 5 steps/iteration)
+./bench_vs/run.sh --steps 1000000 2000000 4000000 8000000
+
 # Run only one prover
 ./bench_vs/run.sh --lambda-only
 ./bench_vs/run.sh --sp1-only
@@ -42,18 +45,21 @@ Only **proving time** is compared (wall-clock, no recursion/compression on eithe
 - **Lambda VM**: Generates RISC-V assembly at runtime, assembles to ELF, proves via the CLI.
 - **SP1 v6**: Compiles a Rust guest program to RISC-V, proves via `sp1-sdk` core mode.
 
+The linear projection uses a common axis for both provers: target workload steps.
+When you pass `--steps`, that target is explicit. When you pass `-n`, the script
+approximates workload as `steps ~= 5 * n`. `SP1 cycles` are still reported, but
+only as telemetry and not as the regression axis.
+
 ## Output
 
 ```
 === Summary ===
 Program: Fibonacci (u64 wrapping)
 
-  n           Lambda VM       SP1 v6     Ratio
-  ---         ---------       ------     -----
-  1000          13.3s         12.4s      0.9x
-  10000         22.4s         12.9s      0.6x
-  100000       116.4s         14.7s      0.1x
-  300000          ...           ...       ...
+  Target steps  Iterations     Lambda VM        SP1 v6    SP1 cycles     Ratio
+  ------------  ----------     ---------        ------    ----------     -----
+  1000000       200000            ...s           ...s       1004794       ...
+  2000000       400000            ...s           ...s       2004794       ...
 
 Green ratio = Lambda VM faster, Red = SP1 faster
 ```
diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 5592e095c..7e3b06c23 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -1,10 +1,13 @@
 #!/bin/bash
 # Benchmark: Lambda VM vs SP1 v6 — Fibonacci proving time comparison.
 #
-# Usage: ./bench_vs/run.sh [-n 1000 50000 100000] [--lambda-only | --sp1-only]
-#                         [--report-dir DIR] [--no-color]
+# Usage: ./bench_vs/run.sh [-n 1000 50000 100000 | --steps 1000000 2000000]
+#                         [--lambda-only | --sp1-only] [--report-dir DIR]
+#                         [--target-steps N] [--no-color]
 #
-# Without -n, runs the default series: 1000 10000 100000 300000
+# Without an explicit series, defaults to:
+#   - iterations mode: 1000 10000 100000 300000
+#   - steps mode: 1000000 2000000 4000000 8000000
 #
 # Prerequisites:
 #   - Lambda VM CLI build dependencies available
@@ -18,7 +21,8 @@ ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
 TMP_DIR="/tmp/bench_fib"
 REPORT_DIR=""
 NO_COLOR=false
-TARGET_STEPS=500000000
+TARGET_STEPS="${TARGET_STEPS:-500000000}"
+APPROX_STEPS_PER_ITERATION=5
 
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@@ -27,8 +31,10 @@ BOLD='\033[1m'
 NC='\033[0m'
 
 # --- Defaults ---------------------------------------------------------------
-DEFAULT_SERIES=(1000 10000 100000 300000)
+DEFAULT_ITERATION_SERIES=(1000 10000 100000 300000)
+DEFAULT_STEP_SERIES=(1000000 2000000 4000000 8000000)
 SERIES=()
+SERIES_MODE=""
 RUN_LAMBDA=true
 RUN_SP1=true
 
@@ -36,6 +42,23 @@ RUN_SP1=true
 while [[ $# -gt 0 ]]; do
     case $1 in
         -n)
+            if [ -n "$SERIES_MODE" ] && [ "$SERIES_MODE" != "iterations" ]; then
+                echo "Cannot mix -n with --steps"
+                exit 1
+            fi
+            SERIES_MODE="iterations"
+            shift
+            while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
+                SERIES+=("$1")
+                shift
+            done
+            ;;
+        --steps)
+            if [ -n "$SERIES_MODE" ] && [ "$SERIES_MODE" != "steps" ]; then
+                echo "Cannot mix --steps with -n"
+                exit 1
+            fi
+            SERIES_MODE="steps"
             shift
             while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
                 SERIES+=("$1")
@@ -51,21 +74,30 @@ while [[ $# -gt 0 ]]; do
             shift
             ;;
         --report-dir)
+            if [[ $# -lt 2 ]]; then echo "--report-dir requires an argument"; exit 1; fi
             REPORT_DIR=$2
             shift 2
             ;;
+        --target-steps)
+            if [[ $# -lt 2 ]]; then echo "--target-steps requires an argument"; exit 1; fi
+            TARGET_STEPS=$2
+            shift 2
+            ;;
         --no-color)
             NO_COLOR=true
             shift
             ;;
         -h|--help)
-            echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--no-color]"
+            echo "Usage: $0 [-n N1 N2 ... | --steps S1 S2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--target-steps N] [--no-color]"
             echo ""
             echo "  -n N1 N2 ...      Fibonacci iteration counts (space-separated)"
-            echo "                    Default series: ${DEFAULT_SERIES[*]}"
+            echo "                    Default iteration series: ${DEFAULT_ITERATION_SERIES[*]}"
+            echo "  --steps S1 S2 ... Approximate workload steps; converted via ${APPROX_STEPS_PER_ITERATION} steps/iteration"
+            echo "                    Default step series: ${DEFAULT_STEP_SERIES[*]}"
             echo "  --lambda-only     Only run Lambda VM benchmark"
             echo "  --sp1-only        Only run SP1 benchmark"
             echo "  --report-dir DIR  Write TSV, metrics, markdown summary, and raw outputs"
+            echo "  --target-steps N  Projection target in workload steps (default: $TARGET_STEPS)"
             echo "  --no-color        Disable ANSI colors"
             exit 0
             ;;
@@ -73,11 +105,19 @@ while [[ $# -gt 0 ]]; do
             echo "Unknown option: $1"
             exit 1
             ;;
-    esac
+        esac
 done
 
+if [ -z "$SERIES_MODE" ]; then
+    SERIES_MODE="iterations"
+fi
+
 if [ ${#SERIES[@]} -eq 0 ]; then
-    SERIES=("${DEFAULT_SERIES[@]}")
+    if [ "$SERIES_MODE" = "steps" ]; then
+        SERIES=("${DEFAULT_STEP_SERIES[@]}")
+    else
+        SERIES=("${DEFAULT_ITERATION_SERIES[@]}")
+    fi
 fi
 
 if ! $RUN_LAMBDA && ! $RUN_SP1; then
@@ -109,6 +149,20 @@ join_slash() {
     printf "%s\n" "$joined"
 }
 
+approx_steps_for_iterations() {
+    local iterations=$1
+    awk -v iterations="$iterations" -v ratio="$APPROX_STEPS_PER_ITERATION" 'BEGIN {
+        printf "%.0f\n", iterations * ratio
+    }'
+}
+
+approx_iterations_for_steps() {
+    local steps=$1
+    awk -v steps="$steps" -v ratio="$APPROX_STEPS_PER_ITERATION" 'BEGIN {
+        printf "%.0f\n", steps / ratio
+    }'
+}
+
 fit_series() {
     local steps_slash=$1
     local values_slash=$2
@@ -197,7 +251,9 @@ PY
 }
 
 echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}"
-echo -e "Series: ${YELLOW}${SERIES[*]}${NC}"
+echo -e "Series mode: ${YELLOW}${SERIES_MODE}${NC}"
+echo -e "Requested series: ${YELLOW}${SERIES[*]}${NC}"
+echo -e "Projection target: ${YELLOW}${TARGET_STEPS}${NC} workload steps"
 echo ""
 
 # --- Pre-build --------------------------------------------------------------
@@ -242,30 +298,60 @@ fi
 
 # --- Run benchmark series ---------------------------------------------------
 
-RESULT_N=()
+RUN_ITERATIONS=()
+RUN_TARGET_STEPS=()
+for value in "${SERIES[@]}"; do
+    if [ "$SERIES_MODE" = "steps" ]; then
+        target_steps=$value
+        iterations=$(approx_iterations_for_steps "$target_steps")
+    else
+        iterations=$value
+        target_steps=$(approx_steps_for_iterations "$iterations")
+    fi
+
+    if [ "$iterations" -le 0 ]; then
+        echo "Invalid series value: $value"
+        exit 1
+    fi
+
+    RUN_ITERATIONS+=("$iterations")
+    RUN_TARGET_STEPS+=("$target_steps")
+done
+
+if [ "$SERIES_MODE" = "steps" ]; then
+    echo -e "Iterations used: ${YELLOW}${RUN_ITERATIONS[*]}${NC}"
+    echo ""
+fi
+
+RESULT_TARGET_STEPS=()
+RESULT_ITERATIONS=()
+RESULT_PROJECTION_STEPS=()
 RESULT_LAMBDA=()
 RESULT_SP1=()
 RESULT_SP1_CYCLES=()
 RESULT_RATIO=()
 
-LAMBDA_STEPS=()
+LAMBDA_PROJECTION_STEPS=()
 LAMBDA_TIMES=()
-SP1_STEPS=()
+SP1_PROJECTION_STEPS=()
 SP1_TIMES=()
+PROJECTION_AXIS="target_workload_steps"
 
 if [ -n "$REPORT_DIR" ]; then
-    printf "n\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv"
+    printf "target_steps\titerations\tprojection_steps\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv"
 fi
 
 run_one() {
     local n=$1
+    local target_steps=$2
     local lambda_time="n/a"
     local sp1_time="n/a"
     local sp1_cycles="n/a"
+    local projection_steps=$target_steps
     local ratio="n/a"
 
     echo ""
-    echo -e "${BOLD}--- n=${n} ---${NC}"
+    echo -e "${BOLD}--- target≈${target_steps} steps (n=${n} iterations) ---${NC}"
 
     if $RUN_LAMBDA; then
         local input_file="$TMP_DIR/lambda_${n}.bin"
@@ -290,8 +376,6 @@ run_one() {
         fi
 
         echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
-        LAMBDA_STEPS+=("$n")
-        LAMBDA_TIMES+=("$lambda_time")
 
         if [ -n "$REPORT_DIR" ]; then
             printf "%s\n" "$lambda_output" > "$REPORT_DIR/raw/lambda_${n}.stdout"
@@ -317,8 +401,6 @@ run_one() {
         fi
 
         echo -e "  SP1 v6:    ${BOLD}${sp1_time}s${NC} (${sp1_cycles} cycles)"
-        SP1_STEPS+=("$n")
-        SP1_TIMES+=("$sp1_time")
 
         if [ -n "$REPORT_DIR" ]; then
             cp "$sp1_output_file" "$REPORT_DIR/raw/sp1_${n}.stdout"
@@ -329,19 +411,37 @@ run_one() {
         ratio=$(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { printf "%.3f", lambda / sp1 }')
     fi
 
-    RESULT_N+=("$n")
+    if [ "$lambda_time" != "n/a" ]; then
+        LAMBDA_PROJECTION_STEPS+=("$target_steps")
+        LAMBDA_TIMES+=("$lambda_time")
+    fi
+    if [ "$sp1_time" != "n/a" ]; then
+        SP1_PROJECTION_STEPS+=("$target_steps")
+        SP1_TIMES+=("$sp1_time")
+    fi
+
+    RESULT_TARGET_STEPS+=("$target_steps")
+    RESULT_ITERATIONS+=("$n")
+    RESULT_PROJECTION_STEPS+=("$projection_steps")
     RESULT_LAMBDA+=("$lambda_time")
     RESULT_SP1+=("$sp1_time")
     RESULT_SP1_CYCLES+=("$sp1_cycles")
     RESULT_RATIO+=("$ratio")
 
     if [ -n "$REPORT_DIR" ]; then
-        printf "%s\t%s\t%s\t%s\t%s\n" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" "$ratio" >> "$REPORT_DIR/results.tsv"
+        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+            "$target_steps" \
+            "$n" \
+            "$projection_steps" \
+            "$lambda_time" \
+            "$sp1_time" \
+            "$sp1_cycles" \
+            "$ratio" >> "$REPORT_DIR/results.tsv"
     fi
 }
 
-for n in "${SERIES[@]}"; do
-    run_one "$n"
+for i in "${!RUN_ITERATIONS[@]}"; do
+    run_one "${RUN_ITERATIONS[$i]}" "${RUN_TARGET_STEPS[$i]}"
 done
 
 # --- Projection -------------------------------------------------------------
@@ -390,11 +490,11 @@ compute_projection() {
     esac
 }
 
-if $RUN_LAMBDA && [ ${#LAMBDA_STEPS[@]} -gt 0 ]; then
-    compute_projection "lambda" "$(join_slash "${LAMBDA_STEPS[@]}")" "$(join_slash "${LAMBDA_TIMES[@]}")"
+if $RUN_LAMBDA && [ ${#LAMBDA_PROJECTION_STEPS[@]} -gt 0 ]; then
+    compute_projection "lambda" "$(join_slash "${LAMBDA_PROJECTION_STEPS[@]}")" "$(join_slash "${LAMBDA_TIMES[@]}")"
 fi
-if $RUN_SP1 && [ ${#SP1_STEPS[@]} -gt 0 ]; then
-    compute_projection "sp1" "$(join_slash "${SP1_STEPS[@]}")" "$(join_slash "${SP1_TIMES[@]}")"
+if $RUN_SP1 && [ ${#SP1_PROJECTION_STEPS[@]} -gt 0 ]; then
+    compute_projection "sp1" "$(join_slash "${SP1_PROJECTION_STEPS[@]}")" "$(join_slash "${SP1_TIMES[@]}")"
 fi
 
 # --- Summary table ----------------------------------------------------------
@@ -405,18 +505,19 @@ echo -e "Program: Fibonacci (u64 wrapping)"
 echo ""
 
 if $RUN_LAMBDA && $RUN_SP1; then
-    printf "  %-10s  %12s  %12s  %12s  %8s\n" "n" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio"
-    printf "  %-10s  %12s  %12s  %12s  %8s\n" "---" "---------" "------" "----------" "-----"
+    printf "  %-12s  %-12s  %12s  %12s  %12s  %8s\n" "Target steps" "Iterations" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio"
+    printf "  %-12s  %-12s  %12s  %12s  %12s  %8s\n" "------------" "----------" "---------" "------" "----------" "-----"
 elif $RUN_LAMBDA; then
-    printf "  %-10s  %12s\n" "n" "Lambda VM"
-    printf "  %-10s  %12s\n" "---" "---------"
+    printf "  %-12s  %-12s  %12s\n" "Target steps" "Iterations" "Lambda VM"
+    printf "  %-12s  %-12s  %12s\n" "------------" "----------" "---------"
 else
-    printf "  %-10s  %12s  %12s\n" "n" "SP1 v6" "SP1 cycles"
-    printf "  %-10s  %12s  %12s\n" "---" "------" "----------"
+    printf "  %-12s  %-12s  %12s  %12s\n" "Target steps" "Iterations" "SP1 v6" "SP1 cycles"
+    printf "  %-12s  %-12s  %12s  %12s\n" "------------" "----------" "------" "----------"
 fi
 
-for i in "${!RESULT_N[@]}"; do
-    n="${RESULT_N[$i]}"
+for i in "${!RESULT_ITERATIONS[@]}"; do
+    target_steps="${RESULT_TARGET_STEPS[$i]}"
+    n="${RESULT_ITERATIONS[$i]}"
     lambda_time="${RESULT_LAMBDA[$i]}"
     sp1_time="${RESULT_SP1[$i]}"
     sp1_cycles="${RESULT_SP1_CYCLES[$i]}"
@@ -430,15 +531,15 @@ for i in "${!RESULT_N[@]}"; do
             else
                 ratio_colored="${GREEN}${ratio_colored}${NC}"
             fi
-            printf "  %-10s  %11ss  %11ss  %12s  " "$n" "$lambda_time" "$sp1_time" "$sp1_cycles"
+            printf "  %-12s  %-12s  %11ss  %11ss  %12s  " "$target_steps" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles"
             echo -e "$ratio_colored"
         else
-            printf "  %-10s  %12s  %12s  %12s  %8s\n" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-"
+            printf "  %-12s  %-12s  %12s  %12s  %12s  %8s\n" "$target_steps" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-"
         fi
     elif $RUN_LAMBDA; then
-        printf "  %-10s  %11ss\n" "$n" "$lambda_time"
+        printf "  %-12s  %-12s  %11ss\n" "$target_steps" "$n" "$lambda_time"
     else
-        printf "  %-10s  %11ss  %12s\n" "$n" "$sp1_time" "$sp1_cycles"
+        printf "  %-12s  %-12s  %11ss  %12s\n" "$target_steps" "$n" "$sp1_time" "$sp1_cycles"
     fi
 done
 
@@ -450,7 +551,9 @@ echo "Raw data in $TMP_DIR/"
 
 if [ -n "$LAMBDA_PROJECTED_S" ] || [ -n "$SP1_PROJECTED_S" ]; then
     echo ""
-    echo -e "${BOLD}=== Linear Projection to 500M Steps ===${NC}"
+    echo -e "${BOLD}=== Linear Projection to ${TARGET_STEPS} Workload Steps ===${NC}"
+    echo "  Axis: target workload steps"
+    echo "  Note: when using iterations input, target steps are approximated as ${APPROX_STEPS_PER_ITERATION} * n"
     if [ -n "$LAMBDA_PROJECTED_S" ]; then
         echo "  Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R²=${LAMBDA_R2}"
     fi
@@ -463,8 +566,13 @@ fi
 
 if [ -n "$REPORT_DIR" ]; then
     {
+        echo "series_mode=$SERIES_MODE"
+        echo "requested_series=$(join_slash "${SERIES[@]}")"
+        echo "target_steps_series=$(join_slash "${RESULT_TARGET_STEPS[@]}")"
+        echo "iterations=$(join_slash "${RESULT_ITERATIONS[@]}")"
+        echo "projection_axis=$PROJECTION_AXIS"
         echo "target_steps=$TARGET_STEPS"
-        echo "series=$(join_slash "${RESULT_N[@]}")"
+        echo "projection_steps=$(join_slash "${RESULT_PROJECTION_STEPS[@]}")"
         echo "lambda_times=$(join_slash "${RESULT_LAMBDA[@]}")"
         echo "sp1_times=$(join_slash "${RESULT_SP1[@]}")"
         echo "sp1_cycles=$(join_slash "${RESULT_SP1_CYCLES[@]}")"
@@ -488,21 +596,25 @@ if [ -n "$REPORT_DIR" ]; then
     {
         echo "# Lambda VM vs SP1 v6 Benchmark"
         echo
-        echo "| n | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |"
-        echo "|--:|--------------:|-----------:|-----------:|------:|"
-        for i in "${!RESULT_N[@]}"; do
-            printf "| %s | %s | %s | %s | %s |\n" \
-                "${RESULT_N[$i]}" \
+        echo "Projection axis: \`$PROJECTION_AXIS\`"
+        echo
+        echo "| Target steps | Iterations | Projection steps | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |"
+        echo "|-------------:|-----------:|-----------------:|--------------:|-----------:|-----------:|------:|"
+        for i in "${!RESULT_ITERATIONS[@]}"; do
+            printf "| %s | %s | %s | %s | %s | %s | %s |\n" \
+                "${RESULT_TARGET_STEPS[$i]}" \
+                "${RESULT_ITERATIONS[$i]}" \
+                "${RESULT_PROJECTION_STEPS[$i]}" \
                 "${RESULT_LAMBDA[$i]}" \
                 "${RESULT_SP1[$i]}" \
                 "${RESULT_SP1_CYCLES[$i]}" \
                 "${RESULT_RATIO[$i]}"
         done
         echo
-        echo "## Linear Projection to 500M Steps"
+        echo "## Linear Projection to ${TARGET_STEPS} Workload Steps"
         echo
-        echo "| Prover | Slope (s / 1M steps) | Intercept (s) | R² | Projected @ 500M (s) | Projected @ 500M (h) |"
-        echo "|--------|----------------------:|--------------:|---:|---------------------:|---------------------:|"
+        echo "| Prover | Slope (s / 1M workload steps) | Intercept (s) | R² | Projected @ ${TARGET_STEPS} (s) | Projected @ ${TARGET_STEPS} (h) |"
+        echo "|--------|-------------------------------:|--------------:|---:|------------------------------:|------------------------------:|"
         if [ -n "$LAMBDA_PROJECTED_S" ]; then
             printf "| Lambda VM | %s | %s | %s | %s | %s |\n" \
                 "$LAMBDA_SLOPE" \

From e9e9fbdebc2666fd44ed9854d27f239fcf973f5a Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 14 Apr 2026 09:54:08 -0300
Subject: [PATCH 13/34] Switch bench_vs projection to measured cycles and add
 --cycles CLI flag

---
 bench_vs/README.md                        |  59 +++++++---
 bench_vs/run.sh                           | 129 ++++++++++++++--------
 bench_vs/sp1/fibonacci/script/src/main.rs |  14 ++-
 bin/cli/src/main.rs                       |  42 ++++++-
 4 files changed, 179 insertions(+), 65 deletions(-)

diff --git a/bench_vs/README.md b/bench_vs/README.md
index 1e8a8d9f3..c38daa601 100644
--- a/bench_vs/README.md
+++ b/bench_vs/README.md
@@ -32,23 +32,49 @@ Compares proving time for an identical u64 wrapping Fibonacci computation.
 # Approximate workload steps (converted with 5 steps/iteration)
 ./bench_vs/run.sh --steps 1000000 2000000 4000000 8000000
 
+# Project to a target cycle count
+./bench_vs/run.sh --target-cycles 500000000
+
 # Run only one prover
 ./bench_vs/run.sh --lambda-only
 ./bench_vs/run.sh --sp1-only
 ```
 
-## What it measures
+## What is measured
 
 Both provers execute the same program: iterative Fibonacci with `u64::wrapping_add`.
-Only **proving time** is compared (wall-clock, no recursion/compression on either side).
-
-- **Lambda VM**: Generates RISC-V assembly at runtime, assembles to ELF, proves via the CLI.
-- **SP1 v6**: Compiles a Rust guest program to RISC-V, proves via `sp1-sdk` core mode.
 
-The linear projection uses a common axis for both provers: target workload steps.
-When you pass `--steps`, that target is explicit. When you pass `-n`, the script
-approximates workload as `steps ~= 5 * n`. `SP1 cycles` are still reported, but
-only as telemetry and not as the regression axis.
+The timing window on both sides is **end-to-end single-shot proving, with no
+verification and no recursion/compression**. Concretely:
+
+| Phase                                      | Lambda VM timer | SP1 v6 timer |
+|--------------------------------------------|:---------------:|:------------:|
+| Read ELF + input from disk                 |        ❌       |       ❌     |
+| Pre-pass execution to count cycles         |        ❌       |       ❌     |
+| `setup` / verifying-key derivation         |  N/A (none)     |       ✅     |
+| ELF parse + guest execution (inside prove) |        ✅       |       ✅     |
+| Trace build                                |        ✅       |       ✅     |
+| AIR construction                           |        ✅       |       ✅     |
+| STARK prove (`core` mode)                  |        ✅       |       ✅     |
+| Proof serialization / write                |        ❌       |       ❌     |
+| Verify                                     |        ❌       |       ❌     |
+
+Both sides run one extra execution pass **outside** the timer to report dynamic
+instruction counts (SP1's `execute(...)` / Lambda's executor pre-pass). This
+costs wall-clock time in the CI job but does not inflate the measured proving
+time, and the cost is symmetric between the two provers.
+
+Lambda VM uses the default proof options from `prover::prove_with_inputs`
+(`GoldilocksCubicProofOptions::with_blowup(2)`, 50 FRI queries). SP1 v6 uses
+the `core` proof mode exposed by `sp1-sdk::ProverClient::from_env()`.
+
+## Projection axis
+
+The linear projection uses **measured cycles** per prover — Lambda's executor
+log count and SP1's `report.total_instruction_count()`. For Fibonacci the two
+values agree to within ~1% (both compile to the same inner loop shape on
+RISC-V). When cycle data is missing, the script falls back to the approximate
+`target_workload_steps ~= 5 * n` label that was passed on the command line.
 
 ## Output
 
@@ -56,10 +82,17 @@ only as telemetry and not as the regression axis.
 === Summary ===
 Program: Fibonacci (u64 wrapping)
 
-  Target steps  Iterations     Lambda VM        SP1 v6    SP1 cycles     Ratio
-  ------------  ----------     ---------        ------    ----------     -----
-  1000000       200000            ...s           ...s       1004794       ...
-  2000000       400000            ...s           ...s       2004794       ...
+  Target steps  Iterations      Lambda (s)   Lambda cycles         SP1 (s)      SP1 cycles     Ratio
+  ------------  ----------      ----------   -------------         -------      ----------     -----
+  1000000       200000              ...s         1004794             ...s         1004794       ...
+  2000000       400000              ...s         2004794             ...s         2004794       ...
 
+Timing window covers single-shot end-to-end proving; SP1 includes setup; both exclude verification.
 Green ratio = Lambda VM faster, Red = SP1 faster
 ```
+
+With `--report-dir DIR` the script writes:
+- `results.tsv` — raw per-run data (`target_steps`, `iterations`, `lambda_time_s`, `lambda_axis_value`, `lambda_cycles`, `sp1_time_s`, `sp1_axis_value`, `sp1_cycles`, `ratio_lambda_over_sp1`).
+- `metrics.txt` — key=value pairs including `timing_window=setup_plus_end_to_end_prove_no_verify`.
+- `summary.md` — the same table plus linear projection to `TARGET_CYCLES` cycles.
+- `raw/` — stdout/stderr of every individual run.
diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index 7e3b06c23..f72f3731e 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -3,7 +3,7 @@
 #
 # Usage: ./bench_vs/run.sh [-n 1000 50000 100000 | --steps 1000000 2000000]
 #                         [--lambda-only | --sp1-only] [--report-dir DIR]
-#                         [--target-steps N] [--no-color]
+#                         [--target-cycles N] [--no-color]
 #
 # Without an explicit series, defaults to:
 #   - iterations mode: 1000 10000 100000 300000
@@ -21,7 +21,7 @@ ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
 TMP_DIR="/tmp/bench_fib"
 REPORT_DIR=""
 NO_COLOR=false
-TARGET_STEPS="${TARGET_STEPS:-500000000}"
+TARGET_CYCLES="${TARGET_CYCLES:-${TARGET_STEPS:-500000000}}"
 APPROX_STEPS_PER_ITERATION=5
 
 RED='\033[0;31m'
@@ -78,9 +78,15 @@ while [[ $# -gt 0 ]]; do
             REPORT_DIR=$2
             shift 2
             ;;
+        --target-cycles)
+            if [[ $# -lt 2 ]]; then echo "--target-cycles requires an argument"; exit 1; fi
+            TARGET_CYCLES=$2
+            shift 2
+            ;;
         --target-steps)
             if [[ $# -lt 2 ]]; then echo "--target-steps requires an argument"; exit 1; fi
-            TARGET_STEPS=$2
+            TARGET_CYCLES=$2
+            echo "Warning: --target-steps is deprecated; use --target-cycles" >&2
             shift 2
             ;;
         --no-color)
@@ -88,7 +94,7 @@ while [[ $# -gt 0 ]]; do
             shift
             ;;
         -h|--help)
-            echo "Usage: $0 [-n N1 N2 ... | --steps S1 S2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--target-steps N] [--no-color]"
+            echo "Usage: $0 [-n N1 N2 ... | --steps S1 S2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--target-cycles N] [--no-color]"
             echo ""
             echo "  -n N1 N2 ...      Fibonacci iteration counts (space-separated)"
             echo "                    Default iteration series: ${DEFAULT_ITERATION_SERIES[*]}"
@@ -96,8 +102,9 @@ while [[ $# -gt 0 ]]; do
             echo "                    Default step series: ${DEFAULT_STEP_SERIES[*]}"
             echo "  --lambda-only     Only run Lambda VM benchmark"
             echo "  --sp1-only        Only run SP1 benchmark"
-            echo "  --report-dir DIR  Write TSV, metrics, markdown summary, and raw outputs"
-            echo "  --target-steps N  Projection target in workload steps (default: $TARGET_STEPS)"
+            echo "  --report-dir DIR   Write TSV, metrics, markdown summary, and raw outputs"
+            echo "  --target-cycles N  Projection target in cycles (default: $TARGET_CYCLES)"
+            echo "  --target-steps N   Deprecated alias for --target-cycles"
             echo "  --no-color        Disable ANSI colors"
             exit 0
             ;;
@@ -253,7 +260,7 @@ PY
 echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}"
 echo -e "Series mode: ${YELLOW}${SERIES_MODE}${NC}"
 echo -e "Requested series: ${YELLOW}${SERIES[*]}${NC}"
-echo -e "Projection target: ${YELLOW}${TARGET_STEPS}${NC} workload steps"
+echo -e "Projection target: ${YELLOW}${TARGET_CYCLES}${NC} cycles"
 echo ""
 
 # --- Pre-build --------------------------------------------------------------
@@ -325,9 +332,11 @@ fi
 
 RESULT_TARGET_STEPS=()
 RESULT_ITERATIONS=()
-RESULT_PROJECTION_STEPS=()
 RESULT_LAMBDA=()
+RESULT_LAMBDA_AXIS=()
+RESULT_LAMBDA_CYCLES=()
 RESULT_SP1=()
+RESULT_SP1_AXIS=()
 RESULT_SP1_CYCLES=()
 RESULT_RATIO=()
 
@@ -335,23 +344,25 @@ LAMBDA_PROJECTION_STEPS=()
 LAMBDA_TIMES=()
 SP1_PROJECTION_STEPS=()
 SP1_TIMES=()
-PROJECTION_AXIS="target_workload_steps"
+# Axis: use measured dynamic instruction counts per prover. If cycle data is
+# unavailable for a run, fall back to the approximated target_workload_steps.
+PROJECTION_AXIS="measured_cycles"
 
 if [ -n "$REPORT_DIR" ]; then
-    printf "target_steps\titerations\tprojection_steps\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv"
+    printf "target_steps\titerations\tlambda_time_s\tlambda_axis_value\tlambda_cycles\tsp1_time_s\tsp1_axis_value\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv"
 fi
 
 run_one() {
     local n=$1
     local target_steps=$2
     local lambda_time="n/a"
+    local lambda_cycles="n/a"
     local sp1_time="n/a"
     local sp1_cycles="n/a"
-    local projection_steps=$target_steps
     local ratio="n/a"
 
     echo ""
-    echo -e "${BOLD}--- target≈${target_steps} steps (n=${n} iterations) ---${NC}"
+    echo -e "${BOLD}--- target~=${target_steps} steps (n=${n} iterations) ---${NC}"
 
     if $RUN_LAMBDA; then
         local input_file="$TMP_DIR/lambda_${n}.bin"
@@ -361,7 +372,7 @@ run_one() {
 
         echo -e "  ${GREEN}[Lambda VM] Proving...${NC}"
         local lambda_output
-        if ! lambda_output=$("$CLI" prove "$LAMBDA_ELF" -o "$proof_file" --private-input "$input_file" --time 2>"$stderr_file"); then
+        if ! lambda_output=$("$CLI" prove "$LAMBDA_ELF" -o "$proof_file" --private-input "$input_file" --time --cycles 2>"$stderr_file"); then
             echo -e "  ${RED}[Lambda VM] FAILED:${NC}"
             cat "$stderr_file"
             exit 1
@@ -369,13 +380,21 @@ run_one() {
         rm -f "$proof_file"
 
         lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*')
+        lambda_cycles=$(echo "$lambda_output" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*')
         if [ -z "$lambda_time" ]; then
             echo -e "  ${RED}[Lambda VM] FAILED: could not parse proving time${NC}"
             printf "%s\n" "$lambda_output"
             exit 1
         fi
+        if [ -z "$lambda_cycles" ]; then
+            lambda_cycles="n/a"
+        fi
 
-        echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
+        if [ "$lambda_cycles" != "n/a" ]; then
+            echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC} (${lambda_cycles} cycles)"
+        else
+            echo -e "  Lambda VM: ${BOLD}${lambda_time}s${NC}"
+        fi
 
         if [ -n "$REPORT_DIR" ]; then
             printf "%s\n" "$lambda_output" > "$REPORT_DIR/raw/lambda_${n}.stdout"
@@ -411,30 +430,45 @@ run_one() {
         ratio=$(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { printf "%.3f", lambda / sp1 }')
     fi
 
+    # Axis selection per prover: use measured cycles when available, otherwise
+    # fall back to the approximated target_steps.
+    local lambda_axis="$target_steps"
+    if [ "$lambda_cycles" != "n/a" ]; then
+        lambda_axis="$lambda_cycles"
+    fi
+    local sp1_axis="$target_steps"
+    if [ "$sp1_cycles" != "n/a" ]; then
+        sp1_axis="$sp1_cycles"
+    fi
+
     if [ "$lambda_time" != "n/a" ]; then
-        LAMBDA_PROJECTION_STEPS+=("$target_steps")
+        LAMBDA_PROJECTION_STEPS+=("$lambda_axis")
         LAMBDA_TIMES+=("$lambda_time")
     fi
     if [ "$sp1_time" != "n/a" ]; then
-        SP1_PROJECTION_STEPS+=("$target_steps")
+        SP1_PROJECTION_STEPS+=("$sp1_axis")
         SP1_TIMES+=("$sp1_time")
     fi
 
     RESULT_TARGET_STEPS+=("$target_steps")
     RESULT_ITERATIONS+=("$n")
-    RESULT_PROJECTION_STEPS+=("$projection_steps")
     RESULT_LAMBDA+=("$lambda_time")
+    RESULT_LAMBDA_AXIS+=("$lambda_axis")
+    RESULT_LAMBDA_CYCLES+=("$lambda_cycles")
     RESULT_SP1+=("$sp1_time")
+    RESULT_SP1_AXIS+=("$sp1_axis")
     RESULT_SP1_CYCLES+=("$sp1_cycles")
     RESULT_RATIO+=("$ratio")
 
     if [ -n "$REPORT_DIR" ]; then
-        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
             "$target_steps" \
             "$n" \
-            "$projection_steps" \
             "$lambda_time" \
+            "$lambda_axis" \
+            "$lambda_cycles" \
             "$sp1_time" \
+            "$sp1_axis" \
             "$sp1_cycles" \
             "$ratio" >> "$REPORT_DIR/results.tsv"
     fi
@@ -469,7 +503,7 @@ compute_projection() {
     fi
 
     read -r slope intercept r2 <<< "$(fit_series "$steps_slash" "$times_slash")"
-    projected_s=$(project_series "$slope" "$intercept" "$TARGET_STEPS")
+    projected_s=$(project_series "$slope" "$intercept" "$TARGET_CYCLES")
     projected_h=$(format_hours "$projected_s")
 
     case "$label" in
@@ -505,20 +539,21 @@ echo -e "Program: Fibonacci (u64 wrapping)"
 echo ""
 
 if $RUN_LAMBDA && $RUN_SP1; then
-    printf "  %-12s  %-12s  %12s  %12s  %12s  %8s\n" "Target steps" "Iterations" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio"
-    printf "  %-12s  %-12s  %12s  %12s  %12s  %8s\n" "------------" "----------" "---------" "------" "----------" "-----"
+    printf "  %-12s  %-12s  %14s  %14s  %14s  %14s  %8s\n" "Target steps" "Iterations" "Lambda (s)" "Lambda cycles" "SP1 (s)" "SP1 cycles" "Ratio"
+    printf "  %-12s  %-12s  %14s  %14s  %14s  %14s  %8s\n" "------------" "----------" "----------" "-------------" "-------" "----------" "-----"
 elif $RUN_LAMBDA; then
-    printf "  %-12s  %-12s  %12s\n" "Target steps" "Iterations" "Lambda VM"
-    printf "  %-12s  %-12s  %12s\n" "------------" "----------" "---------"
+    printf "  %-12s  %-12s  %14s  %14s\n" "Target steps" "Iterations" "Lambda (s)" "Lambda cycles"
+    printf "  %-12s  %-12s  %14s  %14s\n" "------------" "----------" "----------" "-------------"
 else
-    printf "  %-12s  %-12s  %12s  %12s\n" "Target steps" "Iterations" "SP1 v6" "SP1 cycles"
-    printf "  %-12s  %-12s  %12s  %12s\n" "------------" "----------" "------" "----------"
+    printf "  %-12s  %-12s  %14s  %14s\n" "Target steps" "Iterations" "SP1 (s)" "SP1 cycles"
+    printf "  %-12s  %-12s  %14s  %14s\n" "------------" "----------" "-------" "----------"
 fi
 
 for i in "${!RESULT_ITERATIONS[@]}"; do
     target_steps="${RESULT_TARGET_STEPS[$i]}"
     n="${RESULT_ITERATIONS[$i]}"
     lambda_time="${RESULT_LAMBDA[$i]}"
+    lambda_cycles="${RESULT_LAMBDA_CYCLES[$i]}"
     sp1_time="${RESULT_SP1[$i]}"
     sp1_cycles="${RESULT_SP1_CYCLES[$i]}"
     ratio="${RESULT_RATIO[$i]}"
@@ -531,19 +566,20 @@ for i in "${!RESULT_ITERATIONS[@]}"; do
             else
                 ratio_colored="${GREEN}${ratio_colored}${NC}"
             fi
-            printf "  %-12s  %-12s  %11ss  %11ss  %12s  " "$target_steps" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles"
+            printf "  %-12s  %-12s  %13ss  %14s  %13ss  %14s  " "$target_steps" "$n" "$lambda_time" "$lambda_cycles" "$sp1_time" "$sp1_cycles"
             echo -e "$ratio_colored"
         else
-            printf "  %-12s  %-12s  %12s  %12s  %12s  %8s\n" "$target_steps" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-"
+            printf "  %-12s  %-12s  %14s  %14s  %14s  %14s  %8s\n" "$target_steps" "$n" "${lambda_time}s" "$lambda_cycles" "${sp1_time}s" "$sp1_cycles" "-"
         fi
     elif $RUN_LAMBDA; then
-        printf "  %-12s  %-12s  %11ss\n" "$target_steps" "$n" "$lambda_time"
+        printf "  %-12s  %-12s  %13ss  %14s\n" "$target_steps" "$n" "$lambda_time" "$lambda_cycles"
     else
-        printf "  %-12s  %-12s  %11ss  %12s\n" "$target_steps" "$n" "$sp1_time" "$sp1_cycles"
+        printf "  %-12s  %-12s  %13ss  %14s\n" "$target_steps" "$n" "$sp1_time" "$sp1_cycles"
     fi
 done
 
 echo ""
+echo -e "Timing window covers single-shot end-to-end proving; SP1 includes setup; both exclude verification."
 if $RUN_LAMBDA && $RUN_SP1; then
     echo -e "Green ratio = Lambda VM faster, Red = SP1 faster"
 fi
@@ -551,14 +587,14 @@ echo "Raw data in $TMP_DIR/"
 
 if [ -n "$LAMBDA_PROJECTED_S" ] || [ -n "$SP1_PROJECTED_S" ]; then
     echo ""
-    echo -e "${BOLD}=== Linear Projection to ${TARGET_STEPS} Workload Steps ===${NC}"
-    echo "  Axis: target workload steps"
-    echo "  Note: when using iterations input, target steps are approximated as ${APPROX_STEPS_PER_ITERATION} * n"
+    echo -e "${BOLD}=== Linear Projection to ${TARGET_CYCLES} Cycles ===${NC}"
+    echo "  Axis: measured dynamic instruction count per prover (cycles). When cycle data is"
+    echo "        unavailable the script falls back to target_workload_steps ~= ${APPROX_STEPS_PER_ITERATION} * n."
     if [ -n "$LAMBDA_PROJECTED_S" ]; then
-        echo "  Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R²=${LAMBDA_R2}"
+        echo "  Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R2=${LAMBDA_R2}"
     fi
     if [ -n "$SP1_PROJECTED_S" ]; then
-        echo "  SP1 v6:    ${SP1_PROJECTED_S}s (${SP1_PROJECTED_H}h), R²=${SP1_R2}"
+        echo "  SP1 v6:    ${SP1_PROJECTED_S}s (${SP1_PROJECTED_H}h), R2=${SP1_R2}"
     fi
 fi
 
@@ -571,10 +607,13 @@ if [ -n "$REPORT_DIR" ]; then
         echo "target_steps_series=$(join_slash "${RESULT_TARGET_STEPS[@]}")"
         echo "iterations=$(join_slash "${RESULT_ITERATIONS[@]}")"
         echo "projection_axis=$PROJECTION_AXIS"
-        echo "target_steps=$TARGET_STEPS"
-        echo "projection_steps=$(join_slash "${RESULT_PROJECTION_STEPS[@]}")"
+        echo "timing_window=setup_plus_end_to_end_prove_no_verify"
+        echo "target_cycles=$TARGET_CYCLES"
         echo "lambda_times=$(join_slash "${RESULT_LAMBDA[@]}")"
+        echo "lambda_axis_values=$(join_slash "${RESULT_LAMBDA_AXIS[@]}")"
+        echo "lambda_cycles=$(join_slash "${RESULT_LAMBDA_CYCLES[@]}")"
         echo "sp1_times=$(join_slash "${RESULT_SP1[@]}")"
+        echo "sp1_axis_values=$(join_slash "${RESULT_SP1_AXIS[@]}")"
         echo "sp1_cycles=$(join_slash "${RESULT_SP1_CYCLES[@]}")"
         echo "ratios=$(join_slash "${RESULT_RATIO[@]}")"
         if [ -n "$LAMBDA_PROJECTED_S" ]; then
@@ -596,25 +635,27 @@ if [ -n "$REPORT_DIR" ]; then
     {
         echo "# Lambda VM vs SP1 v6 Benchmark"
         echo
-        echo "Projection axis: \`$PROJECTION_AXIS\`"
+        echo "Timing window: \`single-shot end-to-end prove\` (SP1 includes setup; both exclude verification and recursion)."
+        echo
+        echo "Projection axis: \`$PROJECTION_AXIS\` (measured dynamic instruction count per prover)."
         echo
-        echo "| Target steps | Iterations | Projection steps | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |"
-        echo "|-------------:|-----------:|-----------------:|--------------:|-----------:|-----------:|------:|"
+        echo "| Target steps | Iterations | Lambda VM (s) | Lambda cycles | SP1 v6 (s) | SP1 cycles | Ratio |"
+        echo "|-------------:|-----------:|--------------:|--------------:|-----------:|-----------:|------:|"
         for i in "${!RESULT_ITERATIONS[@]}"; do
             printf "| %s | %s | %s | %s | %s | %s | %s |\n" \
                 "${RESULT_TARGET_STEPS[$i]}" \
                 "${RESULT_ITERATIONS[$i]}" \
-                "${RESULT_PROJECTION_STEPS[$i]}" \
                 "${RESULT_LAMBDA[$i]}" \
+                "${RESULT_LAMBDA_CYCLES[$i]}" \
                 "${RESULT_SP1[$i]}" \
                 "${RESULT_SP1_CYCLES[$i]}" \
                 "${RESULT_RATIO[$i]}"
         done
         echo
-        echo "## Linear Projection to ${TARGET_STEPS} Workload Steps"
+        echo "## Linear Projection to ${TARGET_CYCLES} Cycles"
         echo
-        echo "| Prover | Slope (s / 1M workload steps) | Intercept (s) | R² | Projected @ ${TARGET_STEPS} (s) | Projected @ ${TARGET_STEPS} (h) |"
-        echo "|--------|-------------------------------:|--------------:|---:|------------------------------:|------------------------------:|"
+        echo "| Prover | Slope (s / 1M cycles) | Intercept (s) | R2 | Projected @ ${TARGET_CYCLES} (s) | Projected @ ${TARGET_CYCLES} (h) |"
+        echo "|--------|----------------------:|--------------:|---:|------------------------------:|------------------------------:|"
         if [ -n "$LAMBDA_PROJECTED_S" ]; then
             printf "| Lambda VM | %s | %s | %s | %s | %s |\n" \
                 "$LAMBDA_SLOPE" \
diff --git a/bench_vs/sp1/fibonacci/script/src/main.rs b/bench_vs/sp1/fibonacci/script/src/main.rs
index 761d0c911..85730518a 100644
--- a/bench_vs/sp1/fibonacci/script/src/main.rs
+++ b/bench_vs/sp1/fibonacci/script/src/main.rs
@@ -17,18 +17,20 @@ fn main() {
     let mut stdin = SP1Stdin::new();
     stdin.write(&n);
 
-    // Setup
-    let pk = client.setup(FIB_ELF.clone()).expect("setup failed");
-
-    // Execute for cycle count
+    // Cycle count — executed *before* the timer starts, matching Lambda's
+    // pre-pass for symmetry. This costs extra wall-clock but does not inflate
+    // the measured proving time.
     let (_, report) = client
         .execute(FIB_ELF.clone(), stdin.clone())
         .run()
         .unwrap();
     println!("Cycles: {}", report.total_instruction_count());
 
-    // Core proof (no recursion)
+    // Timed window: end-to-end single-shot proving, including `setup`
+    // (verifying-key derivation) and the `core` proof itself. No recursion /
+    // compression, no verification.
     let start = Instant::now();
+    let pk = client.setup(FIB_ELF.clone()).expect("setup failed");
     let proof = client
         .prove(&pk, stdin)
         .core()
@@ -38,7 +40,7 @@ fn main() {
 
     println!("Proving time: {:.3}s", elapsed.as_secs_f64());
 
-    // Verify
+    // Verify (outside the timer, same as Lambda).
     client
         .verify(&proof, pk.verifying_key(), None)
         .expect("verify failed");
diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs
index 3a1917a32..162a201ef 100644
--- a/bin/cli/src/main.rs
+++ b/bin/cli/src/main.rs
@@ -128,9 +128,13 @@ enum Commands {
         #[arg(long)]
         blowup: Option<u8>,
 
-        /// Print timing breakdown
+        /// Print proving time
         #[arg(long)]
         time: bool,
+
+        /// Execute one pre-pass outside the timer and print dynamic instruction count
+        #[arg(long)]
+        cycles: bool,
     },
 
     /// Verify a proof bundle
@@ -168,7 +172,8 @@ fn main() -> ExitCode {
             private_input,
             blowup,
             time,
-        } => cmd_prove(elf, output, private_input, blowup, time),
+            cycles,
+        } => cmd_prove(elf, output, private_input, blowup, time, cycles),
         Commands::Verify {
             proof,
             elf,
@@ -290,6 +295,7 @@ fn cmd_prove(
     private_input_path: Option<PathBuf>,
     blowup: Option<u8>,
     time: bool,
+    cycles: bool,
 ) -> ExitCode {
     eprintln!("Reading ELF file...");
     let elf_data = match std::fs::read(&elf_path) {
@@ -308,6 +314,35 @@ fn cmd_prove(
         }
     };
 
+    // Pre-pass: execute once outside the timer to count dynamic instructions.
+    // Mirrors SP1's cycle-count pass so both provers report the same kind of
+    // number without inflating the measured proving time.
+    let cycle_count = if cycles {
+        let program = match Elf::load(&elf_data) {
+            Ok(p) => p,
+            Err(e) => {
+                eprintln!("Failed to load ELF for cycle count: {:?}", e);
+                return ExitCode::FAILURE;
+            }
+        };
+        let executor = match Executor::new(&program, private_inputs.clone()) {
+            Ok(e) => e,
+            Err(e) => {
+                eprintln!("Failed to create executor for cycle count: {:?}", e);
+                return ExitCode::FAILURE;
+            }
+        };
+        match executor.run() {
+            Ok(result) => Some(result.logs.len() as u64),
+            Err(e) => {
+                eprintln!("Execution failed during cycle count: {:?}", e);
+                return ExitCode::FAILURE;
+            }
+        }
+    } else {
+        None
+    };
+
     #[cfg(feature = "jemalloc-stats")]
     let tracker = heap_tracker::HeapTracker::start();
 
@@ -370,6 +405,9 @@ fn cmd_prove(
     }
 
     eprintln!("Proof written to {:?}", output_path);
+    if let Some(c) = cycle_count {
+        println!("Cycles: {}", c);
+    }
     if time {
         println!("Proving time: {:.3}s", prove_elapsed.as_secs_f64());
     }

From 1e1459e0e3e34fa0a329af25666d4de52d2daabf Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 14 Apr 2026 15:06:54 -0300
Subject: [PATCH 14/34] Fix grep pipefail in bench_vs/run.sh by switching to
 sed

---
 bench_vs/run.sh | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/bench_vs/run.sh b/bench_vs/run.sh
index f72f3731e..3784c6357 100755
--- a/bench_vs/run.sh
+++ b/bench_vs/run.sh
@@ -257,6 +257,22 @@ with open(path, "wb") as fh:
 PY
 }
 
+extract_proving_time() {
+    sed -nE '/Proving time: [0-9.]+s/ {
+        s/.*Proving time: ([0-9.]+)s.*/\1/
+        p
+        q
+    }'
+}
+
+extract_cycles() {
+    sed -nE '/Cycles: [0-9]+/ {
+        s/.*Cycles: ([0-9]+).*/\1/
+        p
+        q
+    }'
+}
+
 echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}"
 echo -e "Series mode: ${YELLOW}${SERIES_MODE}${NC}"
 echo -e "Requested series: ${YELLOW}${SERIES[*]}${NC}"
@@ -379,8 +395,8 @@ run_one() {
         fi
         rm -f "$proof_file"
 
-        lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*')
-        lambda_cycles=$(echo "$lambda_output" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*')
+        lambda_time=$(printf "%s\n" "$lambda_output" | extract_proving_time)
+        lambda_cycles=$(printf "%s\n" "$lambda_output" | extract_cycles)
         if [ -z "$lambda_time" ]; then
             echo -e "  ${RED}[Lambda VM] FAILED: could not parse proving time${NC}"
             printf "%s\n" "$lambda_output"
@@ -411,8 +427,8 @@ run_one() {
             exit 1
         fi
 
-        sp1_time=$(grep -o 'Proving time: [0-9.]*s' "$sp1_output_file" | grep -o '[0-9.]*')
-        sp1_cycles=$(grep -o 'Cycles: [0-9]*' "$sp1_output_file" | grep -o '[0-9]*')
+        sp1_time=$(extract_proving_time < "$sp1_output_file")
+        sp1_cycles=$(extract_cycles < "$sp1_output_file")
         if [ -z "$sp1_time" ] || [ -z "$sp1_cycles" ]; then
             echo -e "  ${RED}[SP1 v6] FAILED: could not parse output${NC}"
             cat "$sp1_output_file"

From 2e65d5aacd8f5095bc5aef7e66e972fe2f6b56c1 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 18:04:22 -0300
Subject: [PATCH 15/34] Add bench_vs_plonky3

---
 .github/workflows/bench-vs-p3-nightly.yml     |   51 +
 Cargo.lock                                    |  408 ++-
 Cargo.toml                                    |    9 +
 bench_vs_plonky3/ANALYSIS_LOG.md              |  432 +++
 bench_vs_plonky3/Cargo.toml                   |   56 +
 bench_vs_plonky3/INSTRUMENTATION.md           |  203 ++
 bench_vs_plonky3/benches/stark_comparison.rs  |  190 ++
 .../p3-goldilocks-patched/Cargo.toml          |  129 +
 .../benches/bench_field.rs                    |   72 +
 .../benches/extension.rs                      |   40 +
 .../src/aarch64_neon/mds.rs                   |  343 +++
 .../src/aarch64_neon/mod.rs                   |   12 +
 .../src/aarch64_neon/packing.rs               |  404 +++
 .../src/aarch64_neon/poseidon1.rs             |  716 +++++
 .../src/aarch64_neon/poseidon1_asm.rs         |  843 ++++++
 .../src/aarch64_neon/poseidon2.rs             |  652 ++++
 .../src/aarch64_neon/poseidon2_asm.rs         | 2621 +++++++++++++++++
 .../src/aarch64_neon/utils.rs                 |  400 +++
 .../p3-goldilocks-patched/src/extension.rs    |  217 ++
 .../p3-goldilocks-patched/src/goldilocks.rs   |  813 +++++
 .../p3-goldilocks-patched/src/lib.rs          |   42 +
 .../p3-goldilocks-patched/src/mds.rs          |  761 +++++
 .../p3-goldilocks-patched/src/poseidon1.rs    | 1143 +++++++
 .../p3-goldilocks-patched/src/poseidon2.rs    |  980 ++++++
 .../src/x86_64_avx2/mds.rs                    |   86 +
 .../src/x86_64_avx2/mod.rs                    |    3 +
 .../src/x86_64_avx2/packing.rs                |  539 ++++
 .../src/x86_64_avx512/mds.rs                  |   86 +
 .../src/x86_64_avx512/mod.rs                  |    3 +
 .../src/x86_64_avx512/packing.rs              |  444 +++
 bench_vs_plonky3/run.sh                       |  410 +++
 bench_vs_plonky3/src/bin/prove_bench.rs       |  185 ++
 bench_vs_plonky3/src/lambda_fibonacci_pair.rs |  326 ++
 bench_vs_plonky3/src/lib.rs                   |  341 +++
 bench_vs_plonky3/src/plonky3_config.rs        |   92 +
 bench_vs_plonky3/src/plonky3_fibonacci.rs     |  144 +
 36 files changed, 14171 insertions(+), 25 deletions(-)
 create mode 100644 .github/workflows/bench-vs-p3-nightly.yml
 create mode 100644 bench_vs_plonky3/ANALYSIS_LOG.md
 create mode 100644 bench_vs_plonky3/Cargo.toml
 create mode 100644 bench_vs_plonky3/INSTRUMENTATION.md
 create mode 100644 bench_vs_plonky3/benches/stark_comparison.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs
 create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs
 create mode 100755 bench_vs_plonky3/run.sh
 create mode 100644 bench_vs_plonky3/src/bin/prove_bench.rs
 create mode 100644 bench_vs_plonky3/src/lambda_fibonacci_pair.rs
 create mode 100644 bench_vs_plonky3/src/lib.rs
 create mode 100644 bench_vs_plonky3/src/plonky3_config.rs
 create mode 100644 bench_vs_plonky3/src/plonky3_fibonacci.rs

diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml
new file mode 100644
index 000000000..b8602d7d4
--- /dev/null
+++ b/.github/workflows/bench-vs-p3-nightly.yml
@@ -0,0 +1,51 @@
+name: Bench Vs Plonky3 Nightly
+
+on:
+  schedule:
+    # 04:30 America/Argentina/Buenos_Aires = 07:30 UTC
+    # SP1 nightly fires at 06:00 UTC (03:00 BA) and runs ~1.5h; scheduling 1.5h
+    # later leaves the self-hosted bench runner free.
+    - cron: "30 7 * * *"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: bench-vs-p3-nightly-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  bench-vs-p3:
+    runs-on: [self-hosted, bench]
+    timeout-minutes: 60
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Rust Environment
+        uses: ./.github/actions/setup-rust
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run nightly Plonky3 benchmark
+        run: |
+          bash ./bench_vs_plonky3/run.sh \
+            --log-rows 19 \
+            --num-sequences 16 \
+            --runs 3 \
+            --no-p3-patch \
+            --scalar \
+            --report-dir bench_vs_p3_artifacts \
+            --no-color
+
+      - name: Upload nightly benchmark artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-vs-p3-nightly-${{ github.run_number }}-${{ github.sha }}
+          path: bench_vs_p3_artifacts
+          retention-days: 90
+
+      - name: Publish summary
+        run: cat bench_vs_p3_artifacts/summary.md >> "$GITHUB_STEP_SUMMARY"
diff --git a/Cargo.lock b/Cargo.lock
index f6eea84d6..ae5305254 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -293,6 +293,30 @@ version = "1.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
 
+[[package]]
+name = "bench-vs-plonky3"
+version = "0.1.0"
+dependencies = [
+ "criterion 0.4.0",
+ "crypto",
+ "math",
+ "p3-air",
+ "p3-challenger",
+ "p3-commit",
+ "p3-dft 0.5.2",
+ "p3-field 0.5.2",
+ "p3-fri",
+ "p3-goldilocks",
+ "p3-keccak",
+ "p3-matrix 0.5.2",
+ "p3-merkle-tree",
+ "p3-symmetric 0.5.2",
+ "p3-uni-stark",
+ "stark",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -1931,7 +1955,7 @@ dependencies = [
  "serde_arrays",
  "sha2",
  "sp1_bls12_381",
- "spin",
+ "spin 0.9.8",
 ]
 
 [[package]]
@@ -2019,6 +2043,15 @@ version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
 
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
 [[package]]
 name = "log"
 version = "0.4.29"
@@ -2221,6 +2254,17 @@ dependencies = [
  "sha2",
 ]
 
+[[package]]
+name = "p3-air"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f2ec9cbfc642fc5173817287c3f8b789d07743b5f7e812d058b7a03e344f9ab"
+dependencies = [
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "tracing",
+]
+
 [[package]]
 name = "p3-baby-bear"
 version = "0.2.3-succinct"
@@ -2228,24 +2272,68 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7521838ecab2ddf4f7bc4ceebad06ec02414729598485c1ada516c39900820e8"
 dependencies = [
  "num-bigint 0.4.6",
- "p3-field",
- "p3-mds",
- "p3-poseidon2",
- "p3-symmetric",
+ "p3-field 0.2.3-succinct",
+ "p3-mds 0.2.3-succinct",
+ "p3-poseidon2 0.2.3-succinct",
+ "p3-symmetric 0.2.3-succinct",
  "rand 0.8.5",
  "serde",
 ]
 
+[[package]]
+name = "p3-challenger"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a0b490c745a7d2adeeafff06411814c8078c432740162332b3cd71be0158a76"
+dependencies = [
+ "p3-field 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-monty-31",
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "tracing",
+]
+
+[[package]]
+name = "p3-commit"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "916ae7989d5c3b49f887f5c55b2f9826bdbb81aaebf834503c4145d8b267c829"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-challenger",
+ "p3-dft 0.5.2",
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "p3-util 0.5.2",
+ "serde",
+]
+
 [[package]]
 name = "p3-dft"
 version = "0.2.3-succinct"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "46414daedd796f1eefcdc1811c0484e4bced5729486b6eaba9521c572c76761a"
 dependencies = [
- "p3-field",
- "p3-matrix",
- "p3-maybe-rayon",
- "p3-util",
+ "p3-field 0.2.3-succinct",
+ "p3-matrix 0.2.3-succinct",
+ "p3-maybe-rayon 0.2.3-succinct",
+ "p3-util 0.2.3-succinct",
+ "tracing",
+]
+
+[[package]]
+name = "p3-dft"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55301e91544440254977108b85c32c09d7ea05f2f0dd61092a2825339906a4a7"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-util 0.5.2",
+ "spin 0.10.0",
  "tracing",
 ]
 
@@ -2258,11 +2346,90 @@ dependencies = [
  "itertools 0.12.1",
  "num-bigint 0.4.6",
  "num-traits",
- "p3-util",
+ "p3-util 0.2.3-succinct",
  "rand 0.8.5",
  "serde",
 ]
 
+[[package]]
+name = "p3-field"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85affca7fc983889f260655c4cf74163eebb94605f702e4b6809ead707cba54f"
+dependencies = [
+ "itertools 0.14.0",
+ "num-bigint 0.4.6",
+ "p3-maybe-rayon 0.5.2",
+ "p3-util 0.5.2",
+ "paste",
+ "rand 0.10.1",
+ "serde",
+ "tracing",
+]
+
+[[package]]
+name = "p3-fri"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ac25574ed306b4c9ad1969faaecc0fe6081d45ad7e1ec236661a6e0e37b39e1"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-challenger",
+ "p3-commit",
+ "p3-dft 0.5.2",
+ "p3-field 0.5.2",
+ "p3-interpolation",
+ "p3-matrix 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-util 0.5.2",
+ "rand 0.10.1",
+ "serde",
+ "spin 0.10.0",
+ "thiserror 2.0.17",
+ "tracing",
+]
+
+[[package]]
+name = "p3-goldilocks"
+version = "0.5.2"
+dependencies = [
+ "num-bigint 0.4.6",
+ "p3-challenger",
+ "p3-dft 0.5.2",
+ "p3-field 0.5.2",
+ "p3-mds 0.5.2",
+ "p3-poseidon1",
+ "p3-poseidon2 0.5.2",
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "paste",
+ "rand 0.10.1",
+ "serde",
+]
+
+[[package]]
+name = "p3-interpolation"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14fd48db63ff15f5e96dc46e6991dbc2d39431b82dcb154bad90f4579236e328"
+dependencies = [
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-util 0.5.2",
+]
+
+[[package]]
+name = "p3-keccak"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebcf27615ece1995e4fcf4c69740f1cf515d1481367a20b4b3ce7f4f1b8d70f7"
+dependencies = [
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "tiny-keccak",
+]
+
 [[package]]
 name = "p3-matrix"
 version = "0.2.3-succinct"
@@ -2270,20 +2437,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3e4de3f373589477cb735ea58e125898ed20935e03664b4614c7fac258b3c42f"
 dependencies = [
  "itertools 0.12.1",
- "p3-field",
- "p3-maybe-rayon",
- "p3-util",
+ "p3-field 0.2.3-succinct",
+ "p3-maybe-rayon 0.2.3-succinct",
+ "p3-util 0.2.3-succinct",
  "rand 0.8.5",
  "serde",
  "tracing",
 ]
 
+[[package]]
+name = "p3-matrix"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53428126b009071563d1d07305a9de8be0d21de00b57d2475289ee32ffca6577"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-field 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-util 0.5.2",
+ "rand 0.10.1",
+ "serde",
+ "tracing",
+]
+
 [[package]]
 name = "p3-maybe-rayon"
 version = "0.2.3-succinct"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c3968ad1160310296eb04f91a5f4edfa38fe1d6b2b8cd6b5c64e6f9b7370979e"
 
+[[package]]
+name = "p3-maybe-rayon"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "082bf467011c06c768c579ec6eb9accb5e1e62108891634cc770396e917f978a"
+dependencies = [
+ "rayon",
+]
+
 [[package]]
 name = "p3-mds"
 version = "0.2.3-succinct"
@@ -2291,14 +2482,81 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2356b1ed0add6d5dfbf7a338ce534a6fde827374394a52cec16a0840af6e97c9"
 dependencies = [
  "itertools 0.12.1",
- "p3-dft",
- "p3-field",
- "p3-matrix",
- "p3-symmetric",
- "p3-util",
+ "p3-dft 0.2.3-succinct",
+ "p3-field 0.2.3-succinct",
+ "p3-matrix 0.2.3-succinct",
+ "p3-symmetric 0.2.3-succinct",
+ "p3-util 0.2.3-succinct",
  "rand 0.8.5",
 ]
 
+[[package]]
+name = "p3-mds"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35209e6214102ea6ec6b8cb1b9c15a9b8e597a39f9173597c957f123bced81b3"
+dependencies = [
+ "p3-dft 0.5.2",
+ "p3-field 0.5.2",
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "rand 0.10.1",
+]
+
+[[package]]
+name = "p3-merkle-tree"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "182a5383a54c50f47866f819946d28d95262f69967902734de8fdecb0d70c774"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-commit",
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "rand 0.10.1",
+ "serde",
+ "thiserror 2.0.17",
+ "tracing",
+]
+
+[[package]]
+name = "p3-monty-31"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ffa8c99ec50c035020bbf5457c6a729ba6a975719c1a8dd3f16421081e4f650c"
+dependencies = [
+ "itertools 0.14.0",
+ "num-bigint 0.4.6",
+ "p3-dft 0.5.2",
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-mds 0.5.2",
+ "p3-poseidon1",
+ "p3-poseidon2 0.5.2",
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "paste",
+ "rand 0.10.1",
+ "serde",
+ "spin 0.10.0",
+ "tracing",
+]
+
+[[package]]
+name = "p3-poseidon1"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a018b618e3fa0aec8be933b1d8e404edd23f46991f6bf3f5c2f3f95e9413fe9"
+dependencies = [
+ "p3-field 0.5.2",
+ "p3-symmetric 0.5.2",
+ "rand 0.10.1",
+]
+
 [[package]]
 name = "p3-poseidon2"
 version = "0.2.3-succinct"
@@ -2306,13 +2564,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da1eec7e1b6900581bedd95e76e1ef4975608dd55be9872c9d257a8a9651c3a"
 dependencies = [
  "gcd",
- "p3-field",
- "p3-mds",
- "p3-symmetric",
+ "p3-field 0.2.3-succinct",
+ "p3-mds 0.2.3-succinct",
+ "p3-symmetric 0.2.3-succinct",
  "rand 0.8.5",
  "serde",
 ]
 
+[[package]]
+name = "p3-poseidon2"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "256a668a9ba916f8767552f13d0ba50d18968bc74a623bfdafa41e2970c944d0"
+dependencies = [
+ "p3-field 0.5.2",
+ "p3-mds 0.5.2",
+ "p3-symmetric 0.5.2",
+ "p3-util 0.5.2",
+ "rand 0.10.1",
+]
+
 [[package]]
 name = "p3-symmetric"
 version = "0.2.3-succinct"
@@ -2320,10 +2591,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edb439bea1d822623b41ff4b51e3309e80d13cadf8b86d16ffd5e6efb9fdc360"
 dependencies = [
  "itertools 0.12.1",
- "p3-field",
+ "p3-field 0.2.3-succinct",
+ "serde",
+]
+
+[[package]]
+name = "p3-symmetric"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c60a71a1507c13611b0f2b0b6e83669fd5b76f8e3115bcbced5ccfdf3ca7807"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-field 0.5.2",
+ "p3-util 0.5.2",
  "serde",
 ]
 
+[[package]]
+name = "p3-uni-stark"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c4ecaad8a7b4cf0fc711278c7a29fdc6d14239157866b17feaf14061834bc51"
+dependencies = [
+ "itertools 0.14.0",
+ "p3-air",
+ "p3-challenger",
+ "p3-commit",
+ "p3-field 0.5.2",
+ "p3-matrix 0.5.2",
+ "p3-maybe-rayon 0.5.2",
+ "p3-util 0.5.2",
+ "serde",
+ "thiserror 2.0.17",
+ "tracing",
+]
+
 [[package]]
 name = "p3-util"
 version = "0.2.3-succinct"
@@ -2333,6 +2635,16 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "p3-util"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8b766b9e9254bf3fa98d76e42cf8a5b30628c182dfd5272d270076ee12f0fc0"
+dependencies = [
+ "serde",
+ "transpose",
+]
+
 [[package]]
 name = "pairing"
 version = "0.23.0"
@@ -2625,6 +2937,15 @@ dependencies = [
  "rand_core 0.9.3",
 ]
 
+[[package]]
+name = "rand"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
+dependencies = [
+ "rand_core 0.10.1",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.3.1"
@@ -2663,6 +2984,12 @@ dependencies = [
  "getrandom 0.3.4",
 ]
 
+[[package]]
+name = "rand_core"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69"
+
 [[package]]
 name = "rand_xorshift"
 version = "0.4.0"
@@ -2906,6 +3233,12 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
 [[package]]
 name = "sec1"
 version = "0.7.3"
@@ -3118,9 +3451,9 @@ dependencies = [
  "lazy_static",
  "num-bigint 0.4.6",
  "p3-baby-bear",
- "p3-field",
- "p3-poseidon2",
- "p3-symmetric",
+ "p3-field 0.2.3-succinct",
+ "p3-poseidon2 0.2.3-succinct",
+ "p3-symmetric 0.2.3-succinct",
  "serde",
  "sha2",
 ]
@@ -3146,6 +3479,15 @@ version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
 
+[[package]]
+name = "spin"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591"
+dependencies = [
+ "lock_api",
+]
+
 [[package]]
 name = "spki"
 version = "0.7.3"
@@ -3189,6 +3531,12 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
+[[package]]
+name = "strength_reduce"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
@@ -3572,6 +3920,16 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "transpose"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
+dependencies = [
+ "num-integer",
+ "strength_reduce",
+]
+
 [[package]]
 name = "typenum"
 version = "1.19.0"
diff --git a/Cargo.toml b/Cargo.toml
index 4d10b7c44..886c206f2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
   "crypto/crypto",
   "crypto/math",
   "bin/cli",
+  "bench_vs_plonky3",
 ]
 
 resolver = "2"
@@ -18,3 +19,11 @@ debug = true
 
 # For profiling with samply/perf, build with:
 #   CARGO_PROFILE_RELEASE_DEBUG=1 cargo build --release
+
+# Patched p3-goldilocks adds a BinomiallyExtendable<3> impl for degree-3
+# extension (same as Lambda's x^3 - 2) and disables NEON packing on aarch64.
+# Used only by bench_vs_plonky3 for apples-to-apples comparisons against
+# Lambda STARK. The nightly workflow comments this block out at CI time to
+# benchmark vanilla p3-goldilocks (degree-2 extension).
+[patch.crates-io]
+p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" }
diff --git a/bench_vs_plonky3/ANALYSIS_LOG.md b/bench_vs_plonky3/ANALYSIS_LOG.md
new file mode 100644
index 000000000..ab19e9a1f
--- /dev/null
+++ b/bench_vs_plonky3/ANALYSIS_LOG.md
@@ -0,0 +1,432 @@
+# Lambda STARK vs Plonky3 — Analysis Log
+
+## Session: 2026-04-14 to 2026-04-16
+
+---
+
+## 0. Final Server Baseline (2026-04-16)
+
+**Config:** blowup=2, 219 queries, grinding=0, ext degree 3 both, scalar (no AVX2), parallel (rayon both), identical AIR (32 cols × 2^18).
+
+**Command:** `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench -p bench-vs-plonky3`
+
+### Prove
+
+| Prover | Time | Throughput |
+|--------|------|------------|
+| Lambda | **1.213 s** | 6.92 Melem/s |
+| Plonky3 | **479 ms** | 17.50 Melem/s |
+| **Ratio** | **2.53×** | |
+
+### Verify
+
+| Prover | Time |
+|--------|------|
+| Lambda | **23.3 ms** |
+| Plonky3 | **20.4 ms** |
+| **Ratio** | **1.14×** |
+
+### Gap attribution (734ms = 1213 - 479)
+
+Extension field is MATCHED (both degree 3). The 2.53× is pure algorithm/implementation:
+
+| Cause | Est. savings | % of gap | Effort |
+|-------|-------------|----------|--------|
+| **Quotient domain eval** (2^18 vs 2^19 LDE) | ~220ms | 30% | Low |
+| **Batched FFT** (coset_lde_batch vs per-column) | ~150ms | 20% | Medium |
+| **Alpha decomposition + monomorphization** | ~100ms | 14% | Medium-High |
+| **FRI folding parallel** | ~73ms | 10% | Very low |
+| **Boundary selectors** (vs zerofier precompute) | ~45ms | 6% | Low |
+| **Memory allocation patterns** | ~37ms | 5% | Low |
+| **SSE2 Keccak residual** (~7% hash advantage) | ~50ms | 7% | N/A (can't fix) |
+| Other (compilation, unrolling, tuning) | ~59ms | 8% | - |
+
+### Predicted instruments breakdown (blowup=2, 219q)
+
+| Phase | Predicted time | % |
+|-------|---------------|---|
+| FRI queries (R4) | 180ms | 15% ← NEW bottleneck (2.19× queries) |
+| R2 constraint eval | 168ms | 14% |
+| R4 deep comp poly | 131ms | 11% |
+| R1 Main Merkle | 105ms | 9% |
+| R4 FRI commit | 76ms | 6% |
+| R1 reconstruct LDE | 71ms | 6% |
+| R3 OOD eval | 71ms | 6% |
+| R1 Main LDE | 65ms | 5% |
+| R4 deep extend | 52ms | 4% |
+| R2 comp Merkle | 13ms | 1% |
+| Pre-pass | 11ms | 1% |
+
+### Optimization roadmap (ranked by impact/effort)
+
+| # | Optimization | Savings | Effort | Result |
+|---|-------------|---------|--------|--------|
+| 1 | Quotient domain (stride=blowup in evaluator) | ~80ms | 1h | 1.13s |
+| 2 | Parallel FRI fold (par_iter) | ~40ms | 30min | 1.09s |
+| 3 | Boundary selectors (replace zerofier precompute) | ~45ms | 2h | 1.05s |
+| 4 | LogUp alpha precompute | ~10ms | 30min | 1.04s |
+| 5 | Monomorphize constraints (enum dispatch) | ~35ms | 4h | 1.00s |
+| 6 | Batched FFT (coset_lde_batch pattern) | ~150ms | 8h | 0.85s |
+| 7 | Row-major trace storage | ~20ms | 8h | 0.83s |
+
+**With items 1-5 (~210ms, ~8h work):** Lambda ~1.0s vs Plonky3 0.48s = **2.08×**
+**With items 1-7 (~380ms, ~24h work):** Lambda ~0.83s vs Plonky3 0.48s = **1.73×**
+**Remaining gap** after all: ~350ms from SSE2 Keccak + deep comp + Plonky3 micro-optimizations
+
+### M1 instruments breakdown (with PR #492, blowup=2, ext3 both)
+
+**Command:** `RUSTFLAGS="-C target-feature=-sha3" cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture`
+
+| Phase | Lambda (1.068s) | % | Plonky3 (352ms) | % | Ratio |
+|------|-----------------|---|-----------------|---|-------|
+| Trace commit (LDE+Merkle) | 317ms (LDE 127 + Merkle 165) | 30% | 138ms (commit to trace data) | 39% | 2.3× |
+| **Constraint eval** | **325ms** | **30%** | **50ms** (quotient_values) | **14%** | **6.5×** |
+| Quotient commit | 53ms | 5% | 49ms | 14% | 1.1× |
+| OOD eval | 62ms | 6% | ~10ms (Lagrange interp) | 3% | 6.2× |
+| Deep comp poly | 173ms | 16% | (inside "open") | | |
+| Deep extend | 36ms | 3% | | | |
+| FRI commit (folding+Merkle) | 83ms | 8% | 47ms (commit phase) | 13% | 1.8× |
+| FRI queries | 1ms | 0% | 2ms (query phase) | 1% | — |
+| Open total | 293ms | 27% | 110ms | 31% | 2.7× |
+| Pre-pass | 7ms | 1% | — | | |
+
+---
+
+## Fairness Audit
+
+### AIR equivalence: VERIFIED
+
+Both AIRs prove the same mathematical statement:
+- 32 cols × 2^18 rows, 2-row window
+- Constraint 1: `next_left = local_left + local_right`
+- Constraint 2: `next_right = local_right + next_left`
+- Boundary: row 0 pins `(a_s, b_s) = (s+1, s+2)` per sequence
+- Test `lambda_pair_trace_matches_plonky3_trace` verifies ALL cells (not subset)
+- Mathematical trace for seq (1,2): (1,2)→(3,5)→(8,13)→(21,34) — identical both sides
+
+### Parameters: ALL MATCHED (except noted)
+
+| Parameter | Lambda | Plonky3 | Status |
+|-----------|--------|---------|--------|
+| Base field | Goldilocks | Goldilocks | ✅ |
+| Extension | degree 3 (`x³−2`) | degree 3 (`x³−2`, vendored) | ✅ |
+| Blowup | 2 | 2 (log_blowup=1) | ✅ |
+| FRI queries | 219 | 219 | ✅ |
+| Grinding | 0 | 0 | ✅ |
+| Hash | Keccak-256 | Keccak-256 | ✅ |
+| Rayon | ON | ON (p3-uni-stark/parallel + p3-dft/parallel) | ✅ |
+| SIMD Goldilocks | OFF | OFF (NEON patched to `Self`) | ✅ |
+| SIMD Keccak (x86) | scalar (sha3 crate) | SSE2 2-wide | ⚠️ residual |
+| SIMD Keccak (M1 with -sha3) | scalar | scalar (fallback) | ✅ |
+
+### Platform fairness guide
+
+| Platform | Command | Keccak P3 | Goldilocks P3 | Fairness |
+|----------|---------|-----------|---------------|----------|
+| **M1 + `-sha3`** | `RUSTFLAGS="-C target-feature=-sha3" cargo bench ...` | Scalar | Scalar | **100% fair** |
+| M1 no flags | `cargo bench ...` | NEON SHA3 HW | Scalar | P3 has Keccak HW |
+| **x86 + `-avx2,-avx512f`** | `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ...` | SSE2 2-wide | Scalar | ~93% fair |
+| x86 no flags | `cargo bench ...` | AVX2 4-wide | AVX2 4-wide | P3 has full SIMD |
+
+**For fairest comparison: M1 with `-sha3`** — only platform where everything is scalar both sides.
+
+### Security model asymmetry (doesn't affect compute, affects interpretation)
+
+- **Lambda (Johnson Bound, proven):** 219 queries × 0.49 bits/query = **~108 bits** proven security
+- **Plonky3 (ethSTARK conjecture):** 219 queries × 1.0 bit/query = **~219 bits** conjectured (cap 192 by field)
+- Same 219 queries = same computational work. Different security interpretation.
+- For "matched security" at 108 conjectured bits, P3 would need only ~108 queries (half the FRI work)
+
+### What's NOT unfairness (architectural differences = what we measure)
+
+These are implementation choices, not benchmark bias:
+- Quotient domain eval (P3) vs full LDE eval (Lambda) → 6.5× constraint eval
+- Monomorphization (P3) vs vtable dispatch (Lambda) → ~1.2× overhead
+- Batched FFT (P3) vs per-column (Lambda) → ~2× trace commit
+- Row-major (P3) vs column-major (Lambda) → cache efficiency
+- Boundary selectors (P3) vs zerofier precompute (Lambda) → ~2× boundary cost
+
+### What IS potential unfairness
+
+1. SSE2 Keccak on x86 — P3 gets 2-wide Keccak, Lambda doesn't. ~7% of total. Unavoidable on x86.
+2. Lambda samples NO extra LogUp/bus challenges for this AIR (verified: `has_aux_trace() = false` skips sampling).
+3. Lambda wraps in `multi_prove` with vec of 1 — transcript clone overhead is negligible.
+
+**Conclusion: The benchmark is fair for comparing prover implementation efficiency.**
+
+---
+
+## 1. Benchmark Setup
+
+### AIR (identical both sides)
+- 16 Fibonacci sequences, 2 cols/sequence = **32 columns**
+- **2^18 rows** (each row packs 2 Fibonacci steps → 2^19 effective steps)
+- 2-row window: `next.left = local.left + local.right`, `next.right = local.right + next.left`
+- 32 boundary constraints pinning initial values via public inputs
+- Test `lambda_pair_trace_matches_plonky3_trace` verifies cell-by-cell equivalence
+
+### Matched parameters
+- Base field: Goldilocks (p = 2^64 − 2^32 + 1)
+- Blowup: 4
+- FRI queries: 100
+- Grinding: 0
+- Hash: Keccak-256 (scalar on both sides when `-C target-feature=-sha3`)
+
+### Unmatched (architectural)
+- **Extension field:** Lambda degree 3 (`x^3 - 2`, 192-bit), Plonky3 degree 2 (`x^2 - 7`, 128-bit)
+  - Plonky3 0.5.2 has Goldilocks extensions for degree 2 and 5, but NOT degree 3
+  - Lambda ext-mul: 9 base muls + 3 reduce128
+  - Plonky3 ext-mul: 4 base muls + 2 adds
+- **Prover architecture:** Lambda multi_prove (even for 1 AIR), Plonky3 uni-stark
+
+### Patches applied
+1. `bench_vs_plonky3/p3-goldilocks-patched/` — `Packing = Self` on aarch64 (disables NEON)
+2. `p3-uni-stark` and `p3-dft` features `["parallel"]` enabled
+3. `stark` feature `parallel` enabled by default in bench
+
+### Files
+- `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` — Lambda AIR matching P3 shape
+- `bench_vs_plonky3/src/plonky3_fibonacci.rs` — Plonky3 AIR
+- `bench_vs_plonky3/src/plonky3_config.rs` — P3 config (matched FRI params)
+- `bench_vs_plonky3/benches/stark_comparison.rs` — Criterion benchmark
+- `bench_vs_plonky3/p3-goldilocks-patched/` — Patched p3-goldilocks (no NEON)
+- Root `Cargo.toml` — `[patch.crates-io]` pointing at the patched p3-goldilocks
+
+---
+
+## 2. Measurements
+
+### Config A: Both rayon, no SIMD, no SHA3 HW (M1 Max)
+
+Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3`
+
+| | Lambda | Plonky3 | Ratio |
+|--|--------|---------|-------|
+| **Prove** | **2.09s** [1.99, 2.20] | **0.86s** [0.84, 0.87] | **P3 2.43× faster** |
+| **Verify** | **6.58ms** | **6.76ms** | **Lambda 1.03× faster** |
+
+### Config B: Lambda rayon ON, Plonky3 rayon OFF, NEON ON (M1 — earlier run)
+
+Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3` (before adding p3 parallel features)
+
+| | Lambda | Plonky3 | Ratio |
+|--|--------|---------|-------|
+| **Prove** | **3.46s** | **2.92s** | **P3 1.18× faster** |
+
+### Config C: Lambda rayon ON, Plonky3 rayon OFF, NEON ON, SHA3 HW ON (M1 — first run)
+
+Command: `cargo bench -p bench-vs-plonky3` (no RUSTFLAGS)
+
+| | Lambda | Plonky3 | Ratio |
+|--|--------|---------|-------|
+| **Prove** | **3.21s** | **1.67s** | **P3 1.92× faster** |
+
+### Server instruments breakdown (Lambda only, 16 cols × 2^18 pair AIR)
+
+Total: **1.246s**
+
+| Phase | Time | % |
+|-------|------|---|
+| R2 constraint eval | 336ms | 27% |
+| R1 Main Merkle | 211ms | 17% |
+| R1 reconstruct (re-LDE) | 143ms | 11% |
+| R4 deep comp poly | 131ms | 11% |
+| R1 Main LDE | 130ms | 10% |
+| R4 FRI commit | 80ms | 6% |
+| R3 OOD eval | 71ms | 6% |
+| R2 comp Merkle | 54ms | 4% |
+| R4 deep extend | 43ms | 3% |
+| Pre-pass | 11ms | 1% |
+
+---
+
+## 3. Root Cause Analysis
+
+### Why Plonky3 is ~2.4× faster (Config A)
+
+#### 3a. Constraint eval domain: 4× overhead (biggest factor)
+- Lambda evaluates constraints on full LDE domain: `N × blowup = 2^20 points` (`evaluator.rs:274`)
+- Plonky3 evaluates on quotient domain: `N = 2^18 points`, then extends via iFFT + FFT
+- Lambda does 4× more constraint evaluations (each involving ext-field ops, frame fill, zerofier division)
+- **Estimated contribution: 1.5-2× of the gap**
+
+#### 3b. Extension field degree 3 vs 2
+- Lambda: 9 base muls per ext-mul (`extensions_goldilocks.rs:293-309`)
+- Plonky3: 4 base muls per ext-mul (`binomial_extension.rs:747-762`)
+- Affects: composition poly, FRI folding, DEEP openings, OOD
+- **Estimated contribution: 1.3-1.5× of the gap**
+
+#### 3c. Virtual dispatch vs monomorphization
+- Lambda: `Vec<Box<dyn TransitionConstraint>>` → vtable call per constraint per point (`traits.rs:248-250`)
+- Plonky3: `air.eval(&mut folder)` → monomorphized, all constraints inlined
+- For 32 constraints × 2^20 points = 32M vtable dispatches in Lambda
+- **Estimated contribution: 1.1-1.2× of the gap**
+
+#### 3d. Data layout: column-major vs row-major
+- Lambda: column-major (cache miss per column access in constraint loop)
+- Plonky3: row-major (contiguous data per row)
+- **Estimated contribution: 1.05-1.1× of the gap**
+
+#### 3e. FRI folding sequential vs parallel
+- Lambda: sequential loop in `fold_evaluations_in_place` (`fri_functions.rs:21`)
+- Plonky3: `par_rows()` parallelized
+- **Estimated contribution: 1.03-1.05× of the gap**
+
+#### Combined: 1.5 × 1.4 × 1.15 × 1.07 × 1.04 ≈ **2.7× (close to measured 2.43×)**
+
+### Why verify is roughly equal
+- Verify doesn't do LDE, Merkle, or constraint eval
+- Only ~100 point openings + FRI check
+- Extension field penalty minimal at small N
+- Lambda's implementation is competitive on this path
+
+---
+
+## 4. SIMD Analysis (from profiling session)
+
+### NEON (aarch64/M1)
+- `target_feature="neon"` and `target_feature="sha3"` are **default on aarch64-apple-darwin**
+- Plonky3 uses `PackedGoldilocksNeon` (WIDTH=2) unconditionally on aarch64 via `#[cfg(target_arch = "aarch64")]`
+- Plonky3 Keccak uses NEON SHA3 instructions (`veor3q_u64`, `vbcaxq_u64`, etc.)
+- Lambda has NO SIMD in the prover
+- **Goldilocks NEON base-field mul is 0.92× SLOWER** than scalar (no native 64×64→128 on NEON)
+- **Fp3 NEON mul is 1.40× faster** (parallelism helps with 3 components)
+- **FFT with SIMD was 0.88× (slower)** due to pack/unpack overhead
+
+### Disabling SIMD
+- NEON packing: patched via `p3-goldilocks-patched` (`type Packing = Self` on aarch64)
+- SHA3 hardware Keccak: `-C target-feature=-sha3` (RUSTFLAGS)
+- Cannot disable NEON via RUSTFLAGS alone (intrinsics used without `#[target_feature]` annotation)
+
+### x86_64 (server)
+- Without `-C target-cpu=native`: only SSE2 (no AVX2) → Plonky3 scalar too
+- With AVX2: `PackedGoldilocksAVX2` (WIDTH=4) — has native `mulq` so SIMD IS beneficial
+- For fair scalar comparison on x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"`
+
+---
+
+## 5. Plonky3 Parallelism
+
+- `p3-maybe-rayon` feature `parallel` is NOT enabled by default
+- Without it, all `par_iter()` calls fall back to `core::iter` (sequential)
+- `Radix2DitParallel` is "parallel" in name only without the feature
+- Must explicitly enable: `p3-uni-stark = { version = "0.5.2", features = ["parallel"] }` + `p3-dft = ...`
+- Verified via `cargo tree -e features | grep p3-maybe-rayon`
+
+---
+
+## 6. Lambda Profiling Results (server, profile_prover, 2^20 × 16 cols)
+
+### Single-threaded (38.7s)
+| Component | % | Category |
+|-----------|---|----------|
+| Constraint evaluation | 32.1% | Compute |
+| Keccak hashing | 15.1% | Hashing |
+| Deep composition poly | 14.0% | Compute |
+| Merkle tree build | 12.0% | Hashing |
+| Field multiplication | 11.1% | Compute |
+| FFT | 10.5% | FFT |
+| Other | 5.2% | |
+
+### Parallel (12 threads, 19.2s — 2.02× speedup)
+| Metric | Value |
+|--------|-------|
+| Parallel efficiency | 16.8% of ideal 12× |
+| CPU utilization | 30.6% |
+| Main thread work | 13.3s |
+| Worker thread work | ~5s each |
+| New #1 bottleneck | Keccak (16.7%) |
+
+### Key profiling findings
+- 100% CPU-bound (no memory/IO stalls)
+- SIMD PackedGoldilocks types exist but are NOT used by prover
+- Iterator overhead (Map::fold + FnMut): 7.6%
+- Memory allocation overhead: 8.9% (page faults + malloc + cfree)
+- Amdahl's Law: ~34% serial portion limits parallel speedup
+
+---
+
+## 7. Optimizations Implemented (then stashed)
+
+### Item 2: Parallel FRI folding
+- File: `crypto/stark/src/fri/fri_functions.rs`
+- Change: `(0..half).into_par_iter().map().collect()` with `#[cfg(feature = "parallel")]`
+- Also: `crypto/stark/src/fri/mod.rs` — added `Send + Sync` bounds
+- Tests: 450/450 passed (121 stark + 326 VM + 3 bench)
+
+### Item 3: Quotient domain constraint evaluation
+- File: `crypto/stark/src/constraints/evaluator.rs` — added `lde_stride: usize` parameter
+- File: `crypto/stark/src/prover.rs` — when `number_of_parts == 1`, uses `lde_stride = blowup_factor`
+  then extends N evaluations to LDE via `interpolate_offset_fft + evaluate_polynomial_on_lde_domain`
+- Tests: 450/450 passed
+- Impact on M1: 2.09s → 2.02s (~3%, within Criterion noise)
+- Impact limited because iFFT+FFT extension cost offsets constraint eval savings
+
+### Why stashed
+User wants clean baseline first (fair comparison), then optimize. These changes are ready to re-apply.
+
+---
+
+## 8. Optimization Priority (from profiling data)
+
+### With parallel enabled (real-world scenario)
+
+| # | Optimization | Impact (parallel) | Effort | Status |
+|---|-------------|-------------------|--------|--------|
+| 1 | PR 492 (LDE cache) | 5-8% (reduces serial) | Done (PR open) | Waiting merge |
+| 2 | BLAKE3 hash | ~12% (Keccak is parallel bottleneck) | Low | Not started |
+| 3 | Quotient domain eval | 3-5% (constraint eval parallelized already) | Medium | Implemented, stashed |
+| 4 | Reduce allocations | 5-8% | Medium | Not started |
+| 5 | Parallel FRI fold | ~3% | Low | Implemented, stashed |
+| 6 | Monomorphize constraints | 3-5% | High | Not started |
+
+### Plonky3 degree-3 extension (Option C)
+- Would eliminate the last asymmetric variable in the comparison
+- Requires implementing `BinomiallyExtendable<3>` for Goldilocks in vendored crate
+- Need Sage computation for: `DTH_ROOT = 2^((p-1)/3)`, `EXT_GENERATOR`
+- Expected: gap drops from 2.43× to ~1.5-1.7× (confirms extension degree accounts for ~40% of gap)
+
+---
+
+## 9. How to Run
+
+### M1 / aarch64 (scalar comparison)
+```bash
+RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3
+```
+
+### x86_64 server (scalar comparison, no AVX2)
+```bash
+cargo bench -p bench-vs-plonky3
+# or explicitly: RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ...
+```
+
+### With instruments (Lambda phase breakdown)
+```bash
+# The crate's `instruments` feature forwards to `stark/instruments`; the breakdown is printed by a test
+cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture
+```
+
+### Verify correctness
+```bash
+cargo test -p bench-vs-plonky3  # 3 tests
+cargo test -p stark --lib       # 121 tests
+cargo test -p lambda-vm-prover  # 326 tests
+```
+
+---
+
+## 10. Key Files Reference
+
+| File | Purpose |
+|------|---------|
+| `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` | Lambda AIR (32 cols, 2-row window) |
+| `bench_vs_plonky3/src/plonky3_fibonacci.rs` | Plonky3 AIR (matching) |
+| `bench_vs_plonky3/src/plonky3_config.rs` | P3 config (FRI params matched) |
+| `bench_vs_plonky3/benches/stark_comparison.rs` | Criterion benchmark |
+| `bench_vs_plonky3/p3-goldilocks-patched/` | Patched p3-goldilocks (no NEON) |
+| `crypto/stark/src/constraints/evaluator.rs` | Constraint eval loop (bottleneck) |
+| `crypto/stark/src/prover.rs` | Prover pipeline (Round 1-4) |
+| `crypto/stark/src/fri/fri_functions.rs` | FRI folding |
+| `crypto/stark/src/domain.rs` | LDE domain definition |
+| `crypto/math/src/fft/polynomial.rs` | FFT / coset_lde_full_expand |
diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
new file mode 100644
index 000000000..a3d4e02e2
--- /dev/null
+++ b/bench_vs_plonky3/Cargo.toml
@@ -0,0 +1,56 @@
+[package]
+name = "bench-vs-plonky3"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+# Lambda STARK
+stark = { path = "../crypto/stark", features = ["test-utils"] }
+crypto = { path = "../crypto/crypto", features = ["std", "serde"] }
+math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] }
+
+# Plonky3 (all 0.5.2)
+p3-air = "0.5.2"
+p3-field = "0.5.2"
+p3-goldilocks = "0.5.2"
+p3-matrix = "0.5.2"
+p3-commit = "0.5.2"
+p3-challenger = "0.5.2"
+p3-symmetric = "0.5.2"
+p3-merkle-tree = "0.5.2"
+p3-keccak = "0.5.2"
+p3-fri = "0.5.2"
+p3-uni-stark = { version = "0.5.2", features = ["parallel"] }
+p3-dft = { version = "0.5.2", features = ["parallel"] }
+
+# Tracing for P3 span-based profiling
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
+
+[dev-dependencies]
+criterion = { version = "0.4", default-features = false }
+
+[features]
+# Both provers run multi-threaded by default: the `parallel` features on
+# `p3-uni-stark`/`p3-dft` above turn on rayon for Plonky3, so Lambda must also
+# enable `parallel` for a fair apples-to-apples comparison. Note that
+# `--no-default-features` only makes Lambda single-threaded, not Plonky3.
+#
+# `p3-degree3` (default on) selects the cubic extension for Plonky3's
+# Challenge type, matching Lambda's `Degree3GoldilocksExtensionField`. It
+# requires the root `[patch.crates-io]` pointing at p3-goldilocks-patched.
+# Disable it (`--no-default-features --features parallel`) together with
+# commenting the patch block to build against vanilla crates.io
+# p3-goldilocks (degree-2 extension).
+default = ["parallel", "p3-degree3"]
+parallel = ["stark/parallel"]
+instruments = ["stark/instruments"]
+p3-degree3 = []
+
+[[bin]]
+name = "prove_bench"
+path = "src/bin/prove_bench.rs"
+
+[[bench]]
+name = "stark_comparison"
+harness = false
diff --git a/bench_vs_plonky3/INSTRUMENTATION.md b/bench_vs_plonky3/INSTRUMENTATION.md
new file mode 100644
index 000000000..0d82afe0e
--- /dev/null
+++ b/bench_vs_plonky3/INSTRUMENTATION.md
@@ -0,0 +1,203 @@
+# `bench_vs_plonky3` — puntos de instrumentación
+
+Guía de referencia para revisores / handoff. Describe **dónde está cada timer
+y qué mide** en la comparación Lambda STARK vs Plonky3. No describe el AIR
+en sí (eso vive en `ANALYSIS_LOG.md`).
+
+## Cómo correrlo
+
+El test que imprime el breakdown se llama `instruments_breakdown`. Hay que
+compilar con la feature `instruments` y pasar `--nocapture` porque la salida
+va a stdout (si no, `cargo test` se la come).
+
+**M1 (100% scalar, fairest):**
+
+```bash
+RUSTFLAGS="-C target-feature=-sha3" \
+cargo test -p bench-vs-plonky3 --features instruments --release -- \
+  instruments_breakdown --nocapture
+```
+
+**x86 (Goldilocks scalar, SSE2 Keccak residual en P3):**
+
+```bash
+RUSTFLAGS="-C target-feature=-avx2,-avx512f" \
+cargo test -p bench-vs-plonky3 --features instruments --release -- \
+  instruments_breakdown --nocapture
+```
+
+## Entrada principal
+
+- Archivo: `bench_vs_plonky3/src/lib.rs`
+- Función: `instruments_breakdown` (línea 82)
+- AIR Fibonacci fijo:
+  - `num_sequences = 16`
+  - `rows = 1 << 18` (2^18)
+  - columns = 32 (2 por secuencia)
+  - `blowup_factor = 2`
+  - `fri_number_of_queries = 219`
+  - `grinding_factor = 0`
+
+El test hace dos pasadas independientes:
+
+1. Corre Lambda STARK con los timers internos del crate `stark` (feature
+   `instruments`).
+2. Corre Plonky3 con un `tracing_subscriber` custom que captura spans.
+
+## Feature flags
+
+`bench_vs_plonky3/Cargo.toml` (líneas 33-40):
+
+```toml
+[features]
+default    = ["parallel", "p3-degree3"]
+parallel   = ["stark/parallel"]
+instruments = ["stark/instruments"]
+```
+
+`crypto/stark/Cargo.toml` (líneas 35-41):
+
+```toml
+[features]
+instruments = []                       # prints de timing en prover/verifier
+parallel    = ["dep:rayon", "crypto/parallel"]
+```
+
+`instruments` y `parallel` **coexisten** (no son excluyentes). En la práctica
+los benchmarks corren siempre con ambos activos: Plonky3 corre con rayon
+(features `parallel` de `p3-uni-stark` y `p3-dft`), así que Lambda también
+tiene que correr en paralelo para comparar apples-to-apples.
+
+## Lambda: estructuras de timing
+
+`crypto/stark/src/instruments.rs`.
+
+### `MultiProveTiming` (líneas 40-50)
+
+Recolectada dentro de `multi_prove` y consumida por el test vía
+`stark::instruments::take()`.
+
+| Campo | Qué mide |
+|---|---|
+| `prepass` | Construcción de domains + `LdeTwiddles` caches. |
+| `main_commits` | Round 1 Phase A: commit de todos los main traces. |
+| `aux_build` | Round 1 Phase B: construcción de aux traces / LogUp. |
+| `aux_commit` | Round 1 Phase B: LDE + Merkle commit de aux traces. |
+| `rounds_2_4` | Tiempo total de Rounds 2-4 (todas las tablas). |
+| `round1_sub` | Sub-op breakdown de Round 1 (`Round1SubOps`). |
+| `table_timings` | Por tabla: `(name, rows, duration, TableSubOps)`. |
+
+### `Round1SubOps` (líneas 28-37)
+
+Sub-ops dentro de Round 1. Se acumulan en `AtomicU64`, así que workers rayon
+las pueden incrementar en paralelo sin perder datos.
+
+| Campo | Qué mide |
+|---|---|
+| `main_lde` | Main trace: `expand_columns_to_lde` (LDE/FFT). |
+| `main_merkle` | Main trace: `commit_columns_bit_reversed` (Merkle). |
+| `aux_lde` | Aux trace: `expand_columns_to_lde`. |
+| `aux_merkle` | Aux trace: `commit_columns_bit_reversed`. |
+
+### `TableSubOps` (líneas 7-24)
+
+Por tabla, dentro de Rounds 2-4. Las partes de R2/R4 se pasan por
+thread-locals (`R2_SUB`, `R4_SUB`) y después se ensamblan en
+`prove_rounds_2_to_4` (ver más abajo).
+
+| Campo | Round | Qué mide |
+|---|---|---|
+| `constraints` | R2 | `evaluator.evaluate()` — constraints sobre dominio LDE. |
+| `comp_decompose` | R2 | `decompose_and_extend_d2` — iFFT + extensión del composition poly. |
+| `comp_commit` | R2 | Merkle commit del composition poly. |
+| `ood` | R3 | Barycentric OOD eval (ver nota sobre dónde se captura). |
+| `deep_comp` | R4 | `compute_deep_composition_poly_evaluations`. |
+| `deep_extend` | R4 | `interpolate_fft` + `evaluate_fft` para extender el deep comp poly. |
+| `fri_commit` | R4 | `fri::commit_phase_from_evaluations` (folds + Merkle layers). |
+| `queries` | R4 | Grinding (si hay) + sampling + FRI query phase + Merkle openings. |
+
+### Dónde se capturan (en `crypto/stark/src/prover.rs`)
+
+- `multi_prove` (línea 1490):
+  - `reset_all()` (1502).
+  - `prepass` timer (1515-1533).
+  - `main_commits` timer (1541-…).
+  - `aux_build`, `aux_commit` timers (durante Round 1 Phase B).
+  - `rounds_2_4` timer; al final: `store(MultiProveTiming)`.
+- `round_2_compute_composition_polynomial` — `constraints` / `comp_decompose` /
+  `comp_commit` (vía `store_r2_sub`).
+- `prove_rounds_2_to_4` — **acá** se captura el OOD:
+  `round_3_dur = t_r3.elapsed()` en líneas 1957-1967, y se guarda en
+  `TableSubOps.ood` (línea 2010). `round_3_evaluate_polynomials_in_out_of_domain_element`
+  **no** tiene instrumentación propia.
+- `round_4_compute_and_run_fri_on_the_deep_composition_polynomial` —
+  `deep_comp` / `deep_extend` / `fri_commit` / `queries`
+  (vía `store_r4_sub`).
+
+## Plonky3: breakdown por spans
+
+Todo vive dentro de `instruments_breakdown` en `bench_vs_plonky3/src/lib.rs`,
+después del bloque de Lambda.
+
+- Se define una `P3TimingLayer` custom (líneas 216-259) que implementa
+  `tracing_subscriber::Layer`:
+  - `on_new_span` guarda el nombre del span.
+  - `on_enter` guarda `Instant::now()`.
+  - `on_close` calcula `start.elapsed()` y lo empuja a un `Vec<(name, ms)>`.
+- Se monta un subscriber con `LevelFilter::DEBUG` (línea 266) y se instala
+  como default **sólo durante el `p3_uni_stark::prove`** (líneas 275-280,
+  scope con `_guard`).
+- Post-prove: orden descendente por duración (287), filtra spans con
+  `ms >= 0.1` (289), y calcula `(unaccounted) = total − Σspans` (293-301).
+
+### Qué implica el diseño
+
+- **La capa no filtra por crate**: captura *cualquier* span DEBUG emitido
+  mientras el subscriber está vivo. En la práctica sólo corre
+  `p3_uni_stark::prove` dentro de ese bloque, así que todos los spans que
+  salen son de Plonky3 — pero si alguien agrega un `#[instrument]` propio
+  dentro del scope del guard, también se va a contar.
+- **No hay instrumentación manual de funciones de Plonky3.** La granularidad
+  del breakdown = spans que Plonky3 ya emite internamente.
+- **Nesting / doble-conteo:** P3 tiene spans anidados (p.ej.
+  `prove ⊃ compute_quotient_values ⊃ evaluate_constraints`). Cada span se
+  cuenta una vez con su wall-clock entre `on_enter` y `on_close`, así que
+  **`Σspans > wall-clock` es esperable, no es un bug**. Consecuencia:
+  `(unaccounted) = total − Σspans` **puede quedar negativo** en presencia de
+  nesting — no significa que falte tiempo, significa que los spans padre se
+  solapan con sus hijos. El código sólo imprime `(unaccounted)` si
+  `> 1.0ms`, así que casos negativos se silencian.
+
+## Segunda capa de instrumentación (no la usa `bench_vs_plonky3`)
+
+Existe una capa adicional en `prover/src/instruments.rs` (líneas 54-211,
+`print_report`) — orientada al ejecutor del VM (execute + trace build + AIR
+construction) que además re-imprime el `MultiProveTiming` del STARK con
+otro formato. `bench_vs_plonky3` **no** la invoca; sólo consume
+`stark::instruments::take()` directamente. Vale la pena saberlo si buscás
+timings y aparecen en logs distintos.
+
+## Advertencias para el revisor
+
+1. Lambda: timing manual, específico del pipeline `multi_prove`. Granularidad
+   fina pero acoplada al código — moverlo rompe los breakpoints.
+2. Plonky3: span-based. Granularidad = la que P3 decida exponer. Si P3 deja
+   de emitir un span en una versión futura, la línea desaparece del reporte
+   sin previo aviso.
+3. Los porcentajes de Lambda se calculan contra el **total wall-clock del
+   test** (no contra `rounds_2_4`), así que la suma no cierra al 100% — hay
+   tiempo fuera de `multi_prove` (construcción de AIR, setup).
+4. Los porcentajes de Plonky3 se calculan contra **`p3_prove_dur`** (solo el
+   `prove`, sin setup).
+5. El benchmark usa **degree 3** para la extensión de Plonky3 *sólo* si el
+   root `Cargo.toml` mantiene:
+   ```toml
+   [patch.crates-io]
+   p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" }
+   ```
+   (línea 26). Sin ese patch, P3 usa la extensión degree 2 de upstream y la
+   comparación deja de ser fair.
+6. Plataforma:
+   - M1: `RUSTFLAGS="-C target-feature=-sha3"` → scalar en ambos lados.
+   - x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` → Goldilocks scalar,
+     residual SSE2 en Keccak de P3 (~7%).
diff --git a/bench_vs_plonky3/benches/stark_comparison.rs b/bench_vs_plonky3/benches/stark_comparison.rs
new file mode 100644
index 000000000..fd90ae7b5
--- /dev/null
+++ b/bench_vs_plonky3/benches/stark_comparison.rs
@@ -0,0 +1,190 @@
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use math::field::element::FieldElement;
+use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
+use math::field::goldilocks::GoldilocksField;
+use p3_uni_stark::{prove as p3_prove, verify as p3_verify};
+use stark::proof::options::ProofOptions;
+use stark::prover::{IsStarkProver, Prover};
+use stark::verifier::{IsStarkVerifier, Verifier};
+
+use bench_vs_plonky3::lambda_fibonacci_pair;
+use bench_vs_plonky3::plonky3_config;
+use bench_vs_plonky3::plonky3_fibonacci;
+
+type F = GoldilocksField;
+type E = Degree3GoldilocksExtensionField;
+type FE = FieldElement<F>;
+
+/// Number of independent Fibonacci sequences.
+const NUM_SEQUENCES: usize = 16;
+
+/// Rows (same for both Lambda and Plonky3 — identical AIR shape).
+///
+/// 2^18 rows × 2 Fibonacci steps packed per row = 2^19 effective Fibonacci
+/// steps per sequence, matching Lambda's original `FibonacciMultiColumnAIR`
+/// at 2^19 rows × 1 step/row.
+const ROWS: usize = 1 << 18;
+const TRACE_LABEL: &str = "fib_pair_16seq_2^18";
+
+/// Proof options used by every Lambda benchmark here — the production setup:
+/// blowup 2 and 219 FRI queries (from `GoldilocksCubicProofOptions::with_blowup(2)`).
+/// Grinding is 0: the PoW work is identical on both sides, so it is left out.
+fn benchmark_proof_options() -> ProofOptions {
+    ProofOptions {
+        grinding_factor: 0,
+        coset_offset: 3,
+        fri_number_of_queries: 219,
+        blowup_factor: 2,
+    }
+}
+
+fn lambda_initial_values() -> Vec<(FE, FE)> {
+    // Sequence k (1-based) starts from the pair (k, k + 1).
+    let seed_pair = |k: u64| (FE::from(k), FE::from(k + 1));
+    (1..=NUM_SEQUENCES as u64).map(seed_pair).collect()
+}
+
+/// Criterion benchmark of Lambda's STARK prover on the paired-Fibonacci AIR.
+///
+/// Trace, public inputs and AIR are rebuilt in the untimed setup of every
+/// iteration, so only `Prover::prove` (with a fresh transcript) is measured.
+/// Throughput is reported in Fibonacci steps: rows × 2 steps × sequences.
+fn bench_lambda_prove(c: &mut Criterion) {
+    let proof_options = benchmark_proof_options();
+    let fib_steps = (ROWS * 2 * NUM_SEQUENCES) as u64;
+
+    // Untimed: fresh prover inputs for one iteration.
+    let build_inputs = |rows: usize| {
+        let seeds = lambda_initial_values();
+        let trace = lambda_fibonacci_pair::compute_trace::<F, E>(&seeds, rows);
+        let pub_inputs = lambda_fibonacci_pair::create_public_inputs(seeds);
+        let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
+            &proof_options,
+            NUM_SEQUENCES,
+        );
+        (trace, pub_inputs, air)
+    };
+
+    let mut group = c.benchmark_group("lambda_stark/prove");
+    group.throughput(Throughput::Elements(fib_steps));
+    group.bench_with_input(
+        BenchmarkId::new("fibonacci", TRACE_LABEL),
+        &ROWS,
+        |b, &rows| {
+            b.iter_batched(
+                || build_inputs(rows),
+                |(mut trace, pub_inputs, air)| {
+                    let mut transcript = DefaultTranscript::<E>::new(&[]);
+                    Prover::<F, E, _>::prove(&air, &mut trace, &pub_inputs, &mut transcript)
+                        .unwrap()
+                },
+                criterion::BatchSize::PerIteration,
+            );
+        },
+    );
+    group.finish();
+}
+
+/// Criterion benchmark of Plonky3's `p3_uni_stark::prove` on the same Fibonacci shape.
+/// Config, AIR, trace and public values are rebuilt in the untimed setup of
+/// every iteration, so only the `p3_prove` call is measured.
+fn bench_plonky3_prove(c: &mut Criterion) {
+    // Untimed: fresh prover inputs for one iteration.
+    let build_inputs = |rows: usize| {
+        let config = plonky3_config::matched_params_config();
+        let air = plonky3_fibonacci::P3FibonacciAir {
+            num_sequences: NUM_SEQUENCES,
+        };
+        let trace = plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, rows);
+        let pis = plonky3_fibonacci::public_values(NUM_SEQUENCES);
+        (config, air, trace, pis)
+    };
+
+    let fib_steps = (ROWS * 2 * NUM_SEQUENCES) as u64;
+    let mut group = c.benchmark_group("plonky3_stark/prove");
+    group.throughput(Throughput::Elements(fib_steps));
+    group.bench_with_input(BenchmarkId::new("fibonacci", TRACE_LABEL), &ROWS, |b, &rows| {
+        b.iter_batched(
+            || build_inputs(rows),
+            |(config, air, trace, pis)| p3_prove(&config, &air, trace, &pis),
+            criterion::BatchSize::PerIteration,
+        );
+    });
+    group.finish();
+}
+
+/// Criterion benchmark of Lambda's STARK verifier.
+///
+/// A single proof is generated up front, outside the measurement; each timed
+/// iteration verifies it with a fresh transcript and asserts that it is accepted.
+fn bench_lambda_verify(c: &mut Criterion) {
+    // Untimed: build the one proof that every iteration verifies.
+    let proof_options = benchmark_proof_options();
+    let seeds = lambda_initial_values();
+    let mut trace = lambda_fibonacci_pair::compute_trace::<F, E>(&seeds, ROWS);
+    let pub_inputs = lambda_fibonacci_pair::create_public_inputs(seeds);
+    let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
+        &proof_options,
+        NUM_SEQUENCES,
+    );
+    let mut prover_transcript = DefaultTranscript::<E>::new(&[]);
+    let proof = Prover::<F, E, _>::prove(&air, &mut trace, &pub_inputs, &mut prover_transcript)
+        .unwrap();
+
+    // Timed: verification only.
+    let verify_once = || {
+        let mut transcript = DefaultTranscript::<E>::new(&[]);
+        assert!(Verifier::<F, E, _>::verify(&proof, &air, &mut transcript));
+    };
+
+    // Same throughput unit as the prove benchmarks: Fibonacci steps.
+    let fib_steps = (ROWS * 2 * NUM_SEQUENCES) as u64;
+    let mut group = c.benchmark_group("lambda_stark/verify");
+    group.throughput(Throughput::Elements(fib_steps));
+    group.bench_with_input(BenchmarkId::new("fibonacci", TRACE_LABEL), &ROWS, |b, _| {
+        b.iter(&verify_once);
+    });
+    group.finish();
+}
+
+/// Criterion benchmark of Plonky3's `p3_uni_stark::verify`.
+///
+/// One proof is produced up front; each timed iteration only verifies it.
+/// The config is built once and shared by prover and verifier, so that its
+/// construction is not billed to the measured verification time.
+fn bench_plonky3_verify(c: &mut Criterion) {
+    let mut group = c.benchmark_group("plonky3_stark/verify");
+    group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64));
+
+    let air = plonky3_fibonacci::P3FibonacciAir {
+        num_sequences: NUM_SEQUENCES,
+    };
+    let trace = plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, ROWS);
+    let pis = plonky3_fibonacci::public_values(NUM_SEQUENCES);
+    let config = plonky3_config::matched_params_config();
+    let proof = p3_prove(&config, &air, trace, &pis);
+
+    group.bench_with_input(BenchmarkId::new("fibonacci", TRACE_LABEL), &ROWS, |b, _| {
+        b.iter(|| p3_verify(&config, &air, &proof, &pis).unwrap());
+    });
+    group.finish();
+}
+
+criterion_group! {
+    name = prove_comparison;
+    config = Criterion::default()
+        .sample_size(10) // 10 samples per benchmark
+        .measurement_time(std::time::Duration::from_secs(120)); // 120 s measurement time
+    targets = bench_lambda_prove, bench_plonky3_prove
+}
+// Verification uses the same sample count with a shorter, 30 s measurement time.
+criterion_group! {
+    name = verify_comparison;
+    config = Criterion::default()
+        .sample_size(10)
+        .measurement_time(std::time::Duration::from_secs(30));
+    targets = bench_lambda_verify, bench_plonky3_verify
+}
+// Bench entry point: runs the prove group first, then the verify group.
+criterion_main!(prove_comparison, verify_comparison);
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml b/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml
new file mode 100644
index 000000000..768a2bb5a
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml
@@ -0,0 +1,129 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2024"
+name = "p3-goldilocks"
+version = "0.5.2"
+build = false
+autolib = false
+autobins = false
+autoexamples = false
+autotests = false
+autobenches = false
+description = "An implementation of the Goldilocks prime field F_p, where p = 2^64 - 2^32 + 1."
+homepage = "https://github.com/Plonky3/Plonky3"
+readme = false
+keywords = [
+    "cryptography",
+    "SNARK",
+    "PLONK",
+    "FRI",
+    "plonky3",
+]
+categories = ["cryptography::cryptocurrencies"]
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/Plonky3/Plonky3"
+resolver = "2"
+
+[lib]
+name = "p3_goldilocks"
+path = "src/lib.rs"
+
+[[bench]]
+name = "bench_field"
+path = "benches/bench_field.rs"
+harness = false
+
+[[bench]]
+name = "extension"
+path = "benches/extension.rs"
+harness = false
+
+[dependencies.num-bigint]
+version = "0.4.6"
+default-features = false
+
+[dependencies.p3-challenger]
+version = "0.5.2"
+
+[dependencies.p3-dft]
+version = "0.5.2"
+
+[dependencies.p3-field]
+version = "0.5.2"
+
+[dependencies.p3-mds]
+version = "0.5.2"
+
+[dependencies.p3-poseidon1]
+version = "0.5.2"
+
+[dependencies.p3-poseidon2]
+version = "0.5.2"
+
+[dependencies.p3-symmetric]
+version = "0.5.2"
+
+[dependencies.p3-util]
+version = "0.5.2"
+
+[dependencies.paste]
+version = "1.0.15"
+
+[dependencies.rand]
+version = "0.10.0"
+default-features = false
+
+[dependencies.serde]
+version = "1.0"
+features = ["derive"]
+default-features = false
+
+[dev-dependencies.criterion]
+version = "0.8"
+
+[dev-dependencies.proptest]
+version = "1.10"
+
+[dev-dependencies.rand]
+version = "0.10.0"
+default-features = false
+
+[lints.clippy]
+cognitive_complexity = "allow"
+match_bool = "warn"
+needless_pass_by_value = "warn"
+redundant_pub_crate = "allow"
+semicolon_if_nothing_returned = "warn"
+too_long_first_doc_paragraph = "allow"
+transmute_undefined_repr = "allow"
+tuple_array_conversions = "allow"
+unused_peekable = "allow"
+
+[lints.clippy.all]
+level = "warn"
+priority = -1
+
+[lints.clippy.nursery]
+level = "warn"
+priority = -1
+
+[lints.rust]
+rust_2024_incompatible_pat = "warn"
+unused_must_use = "deny"
+
+[lints.rust.rust_2018_idioms]
+level = "deny"
+priority = -1
+
+[lints.rustdoc]
+all = "warn"
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs
new file mode 100644
index 000000000..a0d5e05f4
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs
@@ -0,0 +1,72 @@
+use core::any::type_name;
+
+use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
+use p3_field::{Field, PrimeCharacteristicRing};
+use p3_field_testing::bench_func::{
+    benchmark_add_latency, benchmark_add_throughput, benchmark_chunked_linear_combination,
+    benchmark_inv, benchmark_iter_sum, benchmark_sub_latency, benchmark_sub_throughput,
+};
+use p3_field_testing::{
+    benchmark_dot_array, benchmark_mul_latency, benchmark_mul_throughput, benchmark_sum_array,
+};
+use p3_goldilocks::Goldilocks;
+use rand::rngs::SmallRng;
+use rand::{RngExt, SeedableRng};
+
+type F = Goldilocks;
+
+/// Registers the scalar `Goldilocks` arithmetic benchmarks with Criterion.
+fn bench_field(c: &mut Criterion) {
+    const REPS: usize = 200;
+    // A throughput round is 10 operations, so latency runs get 10x the repetitions.
+    const L_REPS: usize = 10 * REPS;
+    const SEVENTH_ROOT_EXP: u64 = 10540996611094048183; // exponent of the "7th_root" benchmark
+    let label = "Goldilocks";
+
+    benchmark_mul_latency::<F, 100>(c, label);
+    benchmark_mul_throughput::<F, 25>(c, label);
+    benchmark_inv::<F>(c, label);
+    benchmark_iter_sum::<F, 4, REPS>(c, label);
+    benchmark_sum_array::<F, 4, REPS>(c, label);
+
+    benchmark_dot_array::<F, 1>(c, label);
+    benchmark_dot_array::<F, 2>(c, label);
+    benchmark_dot_array::<F, 3>(c, label);
+    benchmark_dot_array::<F, 4>(c, label);
+    benchmark_dot_array::<F, 5>(c, label);
+    benchmark_dot_array::<F, 6>(c, label);
+
+    benchmark_add_latency::<F, L_REPS>(c, label);
+    benchmark_add_throughput::<F, REPS>(c, label);
+    benchmark_sub_latency::<F, L_REPS>(c, label);
+    benchmark_sub_throughput::<F, REPS>(c, label);
+
+    benchmark_chunked_linear_combination::<F, F, 100>(c, label);
+
+    let mut rng = SmallRng::seed_from_u64(1);
+    c.bench_function("7th_root", |bencher| {
+        let sample = || rng.random::<F>();
+        let power = |x: F| x.exp_u64(SEVENTH_ROOT_EXP);
+        bencher.iter_batched(sample, power, BatchSize::SmallInput);
+    });
+}
+/// Registers the add/sub/mul benchmarks for the packed type of `Goldilocks`.
+fn bench_packedfield(c: &mut Criterion) {
+    type PF = <F as Field>::Packing;
+    const REPS: usize = 100;
+    // A throughput round is 10 operations, so latency runs get 10x the repetitions.
+    const L_REPS: usize = 10 * REPS;
+    // Benchmarks are labelled with the concrete packed type's name.
+    let label = type_name::<PF>();
+
+    benchmark_add_latency::<PF, L_REPS>(c, label);
+    benchmark_add_throughput::<PF, REPS>(c, label);
+    benchmark_sub_latency::<PF, L_REPS>(c, label);
+    benchmark_sub_throughput::<PF, REPS>(c, label);
+    benchmark_mul_latency::<PF, L_REPS>(c, label);
+    benchmark_mul_throughput::<PF, REPS>(c, label);
+    benchmark_chunked_linear_combination::<F, PF, 100>(c, label);
+}
+
+criterion_group!(goldilocks_arithmetic, bench_field, bench_packedfield); // scalar, then packed benchmarks
+criterion_main!(goldilocks_arithmetic); // bench entry point (the target is declared with `harness = false`)
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs
new file mode 100644
index 000000000..f4bf7e750
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs
@@ -0,0 +1,40 @@
+use criterion::{Criterion, criterion_group, criterion_main};
+use p3_field::extension::BinomialExtensionField;
+use p3_field_testing::bench_func::{
+    benchmark_inv, benchmark_mul_latency, benchmark_mul_throughput, benchmark_square,
+};
+use p3_field_testing::benchmark_mul;
+use p3_goldilocks::Goldilocks;
+
+type EF2 = BinomialExtensionField<Goldilocks, 2>;
+type EF5 = BinomialExtensionField<Goldilocks, 5>;
+
+// Note that each round of throughput has 10 operations
+// So we should have 10 * more repetitions for latency tests.
+const REPS: usize = 50;
+const L_REPS: usize = 10 * REPS;
+
+fn bench_quadratic_extension(c: &mut Criterion) { // square / inverse / multiply benchmarks for `EF2`
+    let name = "BinomialExtensionField<Goldilocks, 2>"; // label passed to every benchmark below
+    benchmark_square::<EF2>(c, name);
+    benchmark_inv::<EF2>(c, name);
+    benchmark_mul::<EF2>(c, name);
+    benchmark_mul_throughput::<EF2, REPS>(c, name);
+    benchmark_mul_latency::<EF2, L_REPS>(c, name); // L_REPS = 10 * REPS, see the constants above
+}
+
+fn bench_quintic_extension(c: &mut Criterion) { // square / inverse / multiply benchmarks for `EF5`
+    let name = "BinomialExtensionField<Goldilocks, 5>"; // label passed to every benchmark below
+    benchmark_square::<EF5>(c, name);
+    benchmark_inv::<EF5>(c, name);
+    benchmark_mul::<EF5>(c, name);
+    benchmark_mul_throughput::<EF5, REPS>(c, name);
+    benchmark_mul_latency::<EF5, L_REPS>(c, name); // L_REPS = 10 * REPS, see the constants above
+}
+
+criterion_group!(
+    bench_goldilocks_ef, // group name
+    bench_quadratic_extension, // degree-2 extension benchmarks
+    bench_quintic_extension // degree-5 extension benchmarks
+);
+criterion_main!(bench_goldilocks_ef); // bench entry point
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs
new file mode 100644
index 000000000..9d4b410d3
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs
@@ -0,0 +1,343 @@
+//! MDS permutation for Goldilocks on aarch64.
+
+use core::arch::aarch64::*;
+use core::mem::transmute;
+
+use p3_mds::MdsPermutation;
+use p3_symmetric::Permutation;
+
+use super::packing::PackedGoldilocksNeon;
+use super::utils::{pack_lanes, unpack_lanes};
+use crate::{Goldilocks, MdsMatrixGoldilocks};
+
+// ---------------------------------------------------------------------------
+// Packed MdsMatrixGoldilocks (delegates to scalar Karatsuba per lane)
+// ---------------------------------------------------------------------------
+
+/// Applies the scalar MDS permutation to each of the two lanes of a packed NEON state independently.
+#[inline]
+fn mds_packed<const WIDTH: usize>(
+    mds: &MdsMatrixGoldilocks,
+    input: &mut [PackedGoldilocksNeon; WIDTH],
+) where
+    MdsMatrixGoldilocks: Permutation<[Goldilocks; WIDTH]>,
+{
+    let (mut lane0, mut lane1) = unpack_lanes(input); // one `[u64; WIDTH]` per lane
+    unsafe { // SAFETY (assumed): `[u64; WIDTH]` and `[Goldilocks; WIDTH]` share a layout — TODO confirm
+        mds.permute_mut(&mut *(&mut lane0 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH]));
+        mds.permute_mut(&mut *(&mut lane1 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH]));
+    }
+    pack_lanes(input, &lane0, &lane1); // presumably writes both permuted lanes back into `input`
+}
+
+impl Permutation<[PackedGoldilocksNeon; 8]> for MdsMatrixGoldilocks {
+    fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 8]) {
+        mds_packed(self, input); // width-8 packed state: scalar MDS applied per lane
+    }
+}
+// Marker impl: the body is empty, nothing beyond `Permutation` is provided here.
+impl MdsPermutation<PackedGoldilocksNeon, 8> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksNeon; 12]> for MdsMatrixGoldilocks {
+    fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 12]) {
+        mds_packed(self, input); // width-12 packed state: scalar MDS applied per lane
+    }
+}
+// Marker impl: the body is empty, nothing beyond `Permutation` is provided here.
+impl MdsPermutation<PackedGoldilocksNeon, 12> for MdsMatrixGoldilocks {}
+
+// ---------------------------------------------------------------------------
+// NEON-accelerated circulant MDS (16-bit chunk multiply-accumulate)
+// ---------------------------------------------------------------------------
+
+/// `2^32 − 1`, which equals `2^64 mod P` by the Goldilocks identity `2^64 ≡ 2^32 − 1 (mod P)`.
+const EPSILON_U32: u32 = 0xffffffff;
+
+/// Reduce two accumulated 4×32-bit chunk vectors back to Goldilocks field
+/// elements. Each `uint32x4_t` holds four 32-bit accumulators representing
+/// the four 16-bit chunks of a Goldilocks element:
+///
+/// ```text
+///     elem = c[0] + c[1]·2¹⁶ + c[2]·2³² + c[3]·2⁴⁸
+/// ```
+///
+/// Returns two Goldilocks values packed in a `uint64x2_t`: lane 0 from `cumul_a`,
+/// lane 1 from `cumul_b`. NOTE(review): the output looks reduced to 64 bits only, not
+/// necessarily below `P` — confirm. Ported from plonky2.
+#[inline(always)]
+unsafe fn mds_reduce([cumul_a, cumul_b]: [uint32x4_t; 2]) -> uint64x2_t {
+    unsafe {
+        let mut lo = vreinterpretq_u64_u32(vuzp1q_u32(cumul_a, cumul_b)); // per lane: c[0] | c[2] << 32
+        let mut hi = vreinterpretq_u64_u32(vuzp2q_u32(cumul_a, cumul_b)); // per lane: c[1] | c[3] << 32
+        // So elem = lo + hi·2¹⁶; fold that into a 64-bit low word plus a small overflow.
+        hi = vsraq_n_u64::<16>(hi, lo); // hi += lo >> 16
+        lo = vsliq_n_u64::<16>(lo, hi); // lo = (hi << 16) | (lo & 0xffff)
+        // The top 16 bits of each `hi` lane are the part of `elem` at or above 2⁶⁴.
+        let top = {
+            let hi_u8 = vreinterpretq_u8_u64(hi);
+            let top_idx =
+                transmute::<[u8; 8], uint8x8_t>([0x06, 0x07, 0xff, 0xff, 0x0e, 0x0f, 0xff, 0xff]);
+            let top_u8 = vqtbl1_u8(hi_u8, top_idx); // bytes 6-7 and 14-15; index 0xff yields 0
+            vreinterpret_u32_u8(top_u8)
+        };
+        // 2⁶⁴ ≡ EPSILON (mod P), so the overflow is folded back in as top·EPSILON.
+        let adj_lo = vmlal_n_u32(lo, top, EPSILON_U32);
+        let wraparound_mask = vcgtq_u64(lo, adj_lo); // all-ones where the addition wrapped
+        vsraq_n_u64::<32>(adj_lo, wraparound_mask) // wrapped lanes get another 2³² − 1 added
+    }
+}
+
+/// NEON-accelerated width-8 circulant MDS: returns `M · state` as raw `u64` words.
+/// NOTE(review): `unsafe` with no caller precondition visible here; presumably only NEON availability — confirm.
+/// Circulant first row: `[7, 1, 3, 8, 8, 3, 4, 9]`
+/// (matches `MATRIX_CIRC_MDS_8_SML_ROW`).
+#[inline(always)]
+pub unsafe fn mds_neon_w8(state: &[u64; 8]) -> [u64; 8] {
+    unsafe {
+        const ROW: [u32; 8] = [7, 1, 3, 8, 8, 3, 4, 9];
+        // Dense matrix built at compile time: row `i` is `ROW` rotated right by `i`.
+        const M: [[u32; 8]; 8] = {
+            let mut m = [[0u32; 8]; 8];
+            let mut i = 0;
+            while i < 8 {
+                let mut j = 0;
+                while j < 8 {
+                    m[i][j] = ROW[(j + 8 - i) % 8];
+                    j += 1;
+                }
+                i += 1;
+            }
+            m
+        };
+        // Widen the four 16-bit chunks of every input word into four 32-bit lanes.
+        let c: [uint32x4_t; 8] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i])));
+
+        let mut res = [0u64; 8];
+        // Two output rows per pass, one accumulator vector each.
+        let mut pair = 0;
+        while pair < 4 {
+            let i0 = 2 * pair;
+            let i1 = i0 + 1;
+
+            let mut a0 = vdupq_n_u32(0);
+            let mut a1 = vdupq_n_u32(0);
+            // Per chunk lane: a_k = Σ_j M[i_k][j] · chunk(state[j]); row sum 43 × 0xffff fits in u32.
+            let mut j = 0;
+            while j < 8 {
+                a0 = vmlaq_n_u32(a0, c[j], M[i0][j]);
+                a1 = vmlaq_n_u32(a1, c[j], M[i1][j]);
+                j += 1;
+            }
+            // Recombine the chunk sums of both rows into two 64-bit results.
+            let r = mds_reduce([a0, a1]);
+            res[i0] = vgetq_lane_u64::<0>(r);
+            res[i1] = vgetq_lane_u64::<1>(r);
+            pair += 1;
+        }
+
+        res
+    }
+}
+
+/// NEON-accelerated width-12 circulant MDS: returns `M · state` as raw `u64` words.
+/// NOTE(review): `unsafe` with no caller precondition visible here; presumably only NEON availability — confirm.
+/// Circulant first row: `[1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]`
+/// (matches `MATRIX_CIRC_MDS_12_SML_ROW`).
+#[inline(always)]
+pub unsafe fn mds_neon_w12(state: &[u64; 12]) -> [u64; 12] {
+    unsafe {
+        const ROW: [u32; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10];
+        // Dense matrix built at compile time: row `i` is `ROW` rotated right by `i`.
+        const M: [[u32; 12]; 12] = {
+            let mut m = [[0u32; 12]; 12];
+            let mut i = 0;
+            while i < 12 {
+                let mut j = 0;
+                while j < 12 {
+                    m[i][j] = ROW[(j + 12 - i) % 12];
+                    j += 1;
+                }
+                i += 1;
+            }
+            m
+        };
+        // Widen the four 16-bit chunks of every input word into four 32-bit lanes.
+        let c: [uint32x4_t; 12] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i])));
+
+        let mut res = [0u64; 12];
+        // Two output rows per pass, one accumulator vector each.
+        let mut pair = 0;
+        while pair < 6 {
+            let i0 = 2 * pair;
+            let i1 = i0 + 1;
+
+            let mut a0 = vdupq_n_u32(0);
+            let mut a1 = vdupq_n_u32(0);
+            // Per chunk lane: a_k = Σ_j M[i_k][j] · chunk(state[j]); row sum 67 × 0xffff fits in u32.
+            let mut j = 0;
+            while j < 12 {
+                a0 = vmlaq_n_u32(a0, c[j], M[i0][j]);
+                a1 = vmlaq_n_u32(a1, c[j], M[i1][j]);
+                j += 1;
+            }
+            // Recombine the chunk sums of both rows into two 64-bit results.
+            let r = mds_reduce([a0, a1]);
+            res[i0] = vgetq_lane_u64::<0>(r);
+            res[i1] = vgetq_lane_u64::<1>(r);
+            pair += 1;
+        }
+
+        res
+    }
+}
+
+/// NEON-accelerated MDS wrapper for use with the generic Poseidon1.
+///
+/// Zero-sized type that implements `Permutation<[Goldilocks; 8]>` and
+/// `Permutation<[Goldilocks; 12]>` by calling `mds_neon_w8` / `mds_neon_w12`. Plugs
+/// into `Poseidon1ExternalLayerGeneric` to accelerate full-round MDS while
+/// keeping LLVM-optimized partial rounds from the generic Poseidon1.
+#[derive(Clone, Debug, Default)]
+pub struct MdsNeonGoldilocks;
+
+impl Permutation<[Goldilocks; 8]> for MdsNeonGoldilocks {
+    fn permute_mut(&self, state: &mut [Goldilocks; 8]) { // SAFETY (assumed): `Goldilocks` has `u64` layout — TODO confirm
+        let raw = unsafe { &*(state as *const [Goldilocks; 8] as *const [u64; 8]) }; // read-only raw view
+        let result = unsafe { mds_neon_w8(raw) }; // computed into a fresh array
+        *unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) } = result; // overwrite in place
+    }
+}
+
+impl Permutation<[Goldilocks; 12]> for MdsNeonGoldilocks {
+    fn permute_mut(&self, state: &mut [Goldilocks; 12]) { // SAFETY (assumed): `Goldilocks` has `u64` layout — TODO confirm
+        let raw = unsafe { &*(state as *const [Goldilocks; 12] as *const [u64; 12]) }; // read-only raw view
+        let result = unsafe { mds_neon_w12(raw) }; // computed into a fresh array
+        *unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) } = result; // overwrite in place
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::PrimeField64;
+    use p3_symmetric::Permutation;
+    use rand::rngs::SmallRng;
+    use rand::{RngExt, SeedableRng};
+
+    use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksNeon};
+
+    type F = Goldilocks;
+
+    // -- Packed MdsMatrixGoldilocks tests: the packed permute must agree with the scalar permute --
+
+    macro_rules! test_neon_mds {
+        ($name:ident, $width:literal) => {
+            #[test]
+            fn $name() {
+                let mut rng = SmallRng::seed_from_u64(1);
+                let mds = MdsMatrixGoldilocks;
+
+                let input: [Goldilocks; $width] = rng.random();
+                let expected = mds.permute(input);
+
+                let packed_input = input.map(Into::<PackedGoldilocksNeon>::into); // same value in every lane
+                let packed_output = mds.permute(packed_input);
+
+                let neon_output = packed_output.map(|x| x.0[0]); // only lane 0 is compared
+                assert_eq!(neon_output, expected);
+            }
+        };
+    }
+
+    test_neon_mds!(test_neon_mds_width_8, 8);
+    test_neon_mds!(test_neon_mds_width_12, 12);
+
+    // -- NEON MDS correctness tests: raw NEON kernels vs. the scalar `MdsMatrixGoldilocks` --
+
+    #[test]
+    fn test_mds_neon_w8_matches_karatsuba() {
+        let mds = MdsMatrixGoldilocks;
+        let mut rng = SmallRng::seed_from_u64(42);
+
+        for _ in 0..100 {
+            let input: [F; 8] = rng.random();
+            let expected = mds.permute(input);
+
+            let raw: [u64; 8] = input.map(|x| x.as_canonical_u64());
+            let result = unsafe { super::mds_neon_w8(&raw) };
+
+            for i in 0..8 {
+                assert_eq!(
+                    F::new(result[i]).as_canonical_u64(), // compared in canonical form
+                    expected[i].as_canonical_u64(),
+                    "NEON MDS w8 mismatch at index {i}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_mds_neon_w12_matches_karatsuba() {
+        let mds = MdsMatrixGoldilocks;
+        let mut rng = SmallRng::seed_from_u64(43);
+
+        for _ in 0..100 {
+            let input: [F; 12] = rng.random();
+            let expected = mds.permute(input);
+
+            let raw: [u64; 12] = input.map(|x| x.as_canonical_u64());
+            let result = unsafe { super::mds_neon_w12(&raw) };
+
+            for i in 0..12 {
+                assert_eq!(
+                    F::new(result[i]).as_canonical_u64(), // compared in canonical form
+                    expected[i].as_canonical_u64(),
+                    "NEON MDS w12 mismatch at index {i}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_mds_neon_boundary_w8() {
+        let mds = MdsMatrixGoldilocks;
+        let p_minus_1 = F::ORDER_U64 - 1;
+
+        for &val in &[0u64, 1, p_minus_1] { // all-equal states at the ends of the canonical range
+            let input: [F; 8] = [F::new(val); 8];
+            let expected = mds.permute(input);
+
+            let raw = [val; 8];
+            let result = unsafe { super::mds_neon_w8(&raw) };
+
+            for i in 0..8 {
+                assert_eq!(
+                    F::new(result[i]).as_canonical_u64(),
+                    expected[i].as_canonical_u64(),
+                    "NEON MDS w8 boundary mismatch at index {i} for value {val}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_mds_neon_boundary_w12() {
+        let mds = MdsMatrixGoldilocks;
+        let p_minus_1 = F::ORDER_U64 - 1;
+
+        for &val in &[0u64, 1, p_minus_1] { // all-equal states at the ends of the canonical range
+            let input: [F; 12] = [F::new(val); 12];
+            let expected = mds.permute(input);
+
+            let raw = [val; 12];
+            let result = unsafe { super::mds_neon_w12(&raw) };
+
+            for i in 0..12 {
+                assert_eq!(
+                    F::new(result[i]).as_canonical_u64(),
+                    expected[i].as_canonical_u64(),
+                    "NEON MDS w12 boundary mismatch at index {i} for value {val}"
+                );
+            }
+        }
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs
new file mode 100644
index 000000000..82516a6cf
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs
@@ -0,0 +1,12 @@
+mod mds;
+mod packing;
+mod poseidon1;
+mod poseidon1_asm;
+mod poseidon2;
+mod poseidon2_asm;
+mod utils;
+
+pub use mds::MdsNeonGoldilocks;
+pub use packing::*;
+pub use poseidon1::*;
+pub use poseidon2::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs
new file mode 100644
index 000000000..f393c3b65
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs
@@ -0,0 +1,404 @@
+use alloc::vec::Vec;
+use core::arch::aarch64::{
+    uint64x2_t, vaddq_u64, vandq_u64, vbicq_u64, vcgtq_s64, vdupq_n_u64, veorq_u64, vgetq_lane_u64,
+    vreinterpretq_s64_u64, vsetq_lane_u64, vshrq_n_u64, vsubq_u64,
+};
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::mem::transmute;
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use p3_field::exponentiation::exp_10540996611094048183;
+use p3_field::op_assign_macros::{
+    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
+    impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
+    ring_sum,
+};
+use p3_field::{
+    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
+    PermutationMonomial, PrimeCharacteristicRing, PrimeField64,
+};
+use p3_util::reconstitute_from_base;
+use rand::distr::{Distribution, StandardUniform};
+use rand::{Rng, RngExt};
+
+use crate::{Goldilocks, P};
+
+const WIDTH: usize = 2; // Number of `Goldilocks` lanes held by one packed value.
+
+/// Equal to `2^32 - 1 = 2^64 mod P`; computed as the wrapping negation of the order, `2^64 - P`.
+const EPSILON: u64 = Goldilocks::ORDER_U64.wrapping_neg();
+
+/// Vectorized NEON implementation of `Goldilocks` arithmetic (`WIDTH` elements per value).
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+#[repr(transparent)] // same layout as the inner array; `to_vector`/`from_vector` transmute it
+#[must_use]
+pub struct PackedGoldilocksNeon(pub [Goldilocks; WIDTH]);
+
+impl PackedGoldilocksNeon {
+    #[inline] // `to_vector`: reinterprets the packed lanes as a NEON `uint64x2_t` (bits unchanged).
+    #[must_use]
+    pub(crate) fn to_vector(self) -> uint64x2_t {
+        unsafe { transmute(self) }
+    }
+    /// Reinterprets a NEON `uint64x2_t` as packed lanes; the lane values are not range-checked.
+    #[inline]
+    pub(crate) fn from_vector(vector: uint64x2_t) -> Self {
+        unsafe { transmute(vector) }
+    }
+    /// Builds a packed value whose lanes all equal `value`.
+    #[inline]
+    const fn broadcast(value: Goldilocks) -> Self {
+        Self([value; WIDTH])
+    }
+}
+
+impl From<Goldilocks> for PackedGoldilocksNeon {
+    fn from(x: Goldilocks) -> Self {
+        Self([x; WIDTH]) // the same scalar in every lane
+    }
+}
+
+impl Add for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline] // Lane-wise field addition; the work is done by the free function `add`.
+    fn add(self, rhs: Self) -> Self {
+        Self::from_vector(add(self.to_vector(), rhs.to_vector()))
+    }
+}
+
+impl Sub for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline] // Lane-wise field subtraction; the work is done by the free function `sub`.
+    fn sub(self, rhs: Self) -> Self {
+        Self::from_vector(sub(self.to_vector(), rhs.to_vector()))
+    }
+}
+
+impl Neg for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline] // Lane-wise field negation; the work is done by the free function `neg`.
+    fn neg(self) -> Self {
+        Self::from_vector(neg(self.to_vector()))
+    }
+}
+
+impl Mul for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline] // Lane-wise field multiplication; the work is done by the free function `mul`.
+    fn mul(self, rhs: Self) -> Self {
+        Self::from_vector(mul(self.to_vector(), rhs.to_vector()))
+    }
+}
+
+impl_add_assign!(PackedGoldilocksNeon); // NOTE(review): these five macros are imported from
+impl_sub_assign!(PackedGoldilocksNeon); // `p3_field::op_assign_macros`; their expansions are
+impl_mul_methods!(PackedGoldilocksNeon); // not visible in this file — confirm there which
+ring_sum!(PackedGoldilocksNeon); // impls each one generates.
+impl_rng!(PackedGoldilocksNeon);
+
+impl PrimeCharacteristicRing for PackedGoldilocksNeon {
+    type PrimeSubfield = Goldilocks;
+    // The ring constants are the scalar constants broadcast to every lane.
+    const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
+    const ONE: Self = Self::broadcast(Goldilocks::ONE);
+    const TWO: Self = Self::broadcast(Goldilocks::TWO);
+    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
+    /// Broadcasts `f` to every lane (through the `From<Goldilocks>` impl).
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        f.into()
+    }
+    /// Lane-wise halving, delegated to the vector helper `halve`.
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::from_vector(halve(self.to_vector()))
+    }
+    /// Lane-wise squaring, delegated to the vector helper `square`.
+    #[inline]
+    fn square(&self) -> Self {
+        Self::from_vector(square(self.to_vector()))
+    }
+    /// `len` packed zeros, regrouped from the `len * WIDTH` scalars of `Goldilocks::zero_vec`.
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
+    }
+}
+
+impl InjectiveMonomial<7> for PackedGoldilocksNeon {} // empty impl: no items are overridden
+
+impl PermutationMonomial<7> for PackedGoldilocksNeon {
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self) // presumably x^e; 7 * e = 4 * (P - 1) + 1, so a 7th root
+    }
+}
+
+impl_add_base_field!(PackedGoldilocksNeon, Goldilocks); // NOTE(review): presumably these add
+impl_sub_base_field!(PackedGoldilocksNeon, Goldilocks); // mixed packed/scalar operator impls;
+impl_mul_base_field!(PackedGoldilocksNeon, Goldilocks); // the macros live in
+impl_div_methods!(PackedGoldilocksNeon, Goldilocks); // `p3_field::op_assign_macros` and are
+impl_sum_prod_base_field!(PackedGoldilocksNeon, Goldilocks); // not visible here — confirm.
+
+impl Algebra<Goldilocks> for PackedGoldilocksNeon {
+    // Benchmarked on AArch64 NEON: chunk=2 ≈ 182ns, chunk=4 ≈ 198ns, chunk=8 ≈ 221ns (2 wins).
+    const BATCHED_LC_CHUNK: usize = 2;
+}
+
+impl_packed_value!(PackedGoldilocksNeon, Goldilocks, WIDTH);
+// NOTE(review): `PackedField` is an unsafe trait from `p3_field`; its contract is not shown here.
+unsafe impl PackedField for PackedGoldilocksNeon {
+    type Scalar = Goldilocks;
+}
+
+/// Interleave two 64-bit vectors at the element level (a 2x2 transpose of the rows `v0`, `v1`).
+/// For block_len=1: [a0, a1] x [b0, b1] -> [a0, b0], [a1, b1]
+#[inline]
+pub fn interleave_u64(v0: uint64x2_t, v1: uint64x2_t) -> (uint64x2_t, uint64x2_t) {
+    unsafe {
+        let a0 = vgetq_lane_u64::<0>(v0);
+        let a1 = vgetq_lane_u64::<1>(v0);
+        let b0 = vgetq_lane_u64::<0>(v1);
+        let b1 = vgetq_lane_u64::<1>(v1);
+        // Each output starts as a zero vector; lane 0 is set first, then lane 1.
+        // r0 = [a0, b0], r1 = [a1, b1]
+        let r0 = vsetq_lane_u64::<1>(b0, vsetq_lane_u64::<0>(a0, vdupq_n_u64(0)));
+        let r1 = vsetq_lane_u64::<1>(b1, vsetq_lane_u64::<0>(a1, vdupq_n_u64(0)));
+
+        (r0, r1)
+    }
+}
+
+unsafe impl PackedFieldPow2 for PackedGoldilocksNeon {
+    fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
+        let (v0, v1) = (self.to_vector(), other.to_vector());
+        let (res0, res1) = match block_len {
+            1 => interleave_u64(v0, v1), // exchange at single-element granularity
+            2 => (v0, v1), // a block spans the whole vector: both inputs are returned unchanged
+            _ => panic!("unsupported block length"), // only 1 and 2 (= WIDTH) are handled
+        };
+        (Self::from_vector(res0), Self::from_vector(res1))
+    }
+}
+
+// The helpers below compare lanes with the signed `vcgtq_s64`. Values are therefore XOR-ed with
+// 2^63 first ("shifted", `_s` suffix) so that signed order of shifted values = unsigned order.
+const SIGN_BIT: uint64x2_t = unsafe { transmute([i64::MIN as u64; WIDTH]) };
+const SHIFTED_FIELD_ORDER: uint64x2_t =
+    unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) };
+const EPSILON_VEC: uint64x2_t = unsafe { transmute([EPSILON; WIDTH]) };
+
+#[inline(always)] // Flips the top bit of each lane; applying it twice restores the input.
+fn shift(x: uint64x2_t) -> uint64x2_t {
+    unsafe { veorq_u64(x, SIGN_BIT) }
+}
+
+#[inline(always)] // Reduces each lane to [0, P); input and output are both in shifted form.
+unsafe fn canonicalize_s(x_s: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let x_s_signed = vreinterpretq_s64_u64(x_s);
+        let order_s_signed = vreinterpretq_s64_u64(SHIFTED_FIELD_ORDER);
+        let mask = vcgtq_s64(order_s_signed, x_s_signed); // all-ones where x < P
+        let wrapback_amt = vbicq_u64(EPSILON_VEC, mask); // EPSILON & !mask: EPSILON where x >= P
+        vaddq_u64(x_s, wrapback_amt) // adding EPSILON = 2^64 - P wraps around to x - P
+    }
+}
+
+#[inline(always)] // x + y with one wrap-around fix-up; `y_s` and the result are shifted, `x` is not.
+unsafe fn add_no_double_overflow_64_64s_s(x: uint64x2_t, y_s: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let res_wrapped_s = vaddq_u64(x, y_s);
+        // After XOR shift, signed comparison correctly detects overflow.
+        // Overflow occurred iff y_s > res_wrapped_s (as signed, due to shift semantics)
+        let y_s_signed = vreinterpretq_s64_u64(y_s);
+        let res_s_signed = vreinterpretq_s64_u64(res_wrapped_s);
+        let mask = vcgtq_s64(y_s_signed, res_s_signed);
+        // wrapback_amt is EPSILON on overflow (all-ones >> 32 = 2^32 - 1) and 0 otherwise
+        let wrapback_amt = vshrq_n_u64::<32>(mask);
+        vaddq_u64(res_wrapped_s, wrapback_amt) // cannot wrap a second time when y < P
+    }
+}
+
+/// Goldilocks modular addition. The sum is a valid representative but is not canonicalized here.
+#[inline]
+fn add(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let y_s = shift(y);
+        let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); // y reduced to < P
+        shift(res_s) // undo the shift carried over from `y_s`
+    }
+}
+
+/// Goldilocks modular subtraction (`x - y`); only `y` is canonicalized before subtracting.
+#[inline]
+fn sub(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let mut y_s = shift(y);
+        y_s = canonicalize_s(y_s);
+        let x_s = shift(x);
+        let y_s_signed = vreinterpretq_s64_u64(y_s);
+        let x_s_signed = vreinterpretq_s64_u64(x_s);
+        // -1 if underflow (y > x)
+        let mask = vcgtq_s64(y_s_signed, x_s_signed);
+        let wrapback_amt = vshrq_n_u64::<32>(mask); // EPSILON on underflow, 0 otherwise
+        let res_wrapped = vsubq_u64(x_s, y_s); // the two shifts cancel: x - y mod 2^64
+        vsubq_u64(res_wrapped, wrapback_amt) // subtracting EPSILON equals adding P mod 2^64
+    }
+}
+
+/// Goldilocks modular negation, computed as `P - y` after reducing `y`; zero therefore maps to `P`.
+#[inline]
+fn neg(y: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let y_s = shift(y);
+        vsubq_u64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) // both operands shifted: shifts cancel
+    }
+}
+
+/// Halve a vector of Goldilocks field elements (branch-free, lane-wise).
+#[inline(always)]
+pub(crate) fn halve(input: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let one = vdupq_n_u64(1);
+        let zero = vdupq_n_u64(0);
+        let half = vdupq_n_u64(P.div_ceil(2)); // (P + 1) / 2, assuming `crate::P` is the field order
+        // Even x: x / 2 = x >> 1. Odd x: (x + P) / 2 = (x >> 1) + (P + 1) / 2.
+        let least_bit = vandq_u64(input, one);
+        let t = vshrq_n_u64::<1>(input);
+        // neg_least_bit is 0 or -1 (all bits 1)
+        let neg_least_bit = vsubq_u64(zero, least_bit);
+        let maybe_half = vandq_u64(half, neg_least_bit);
+        vaddq_u64(t, maybe_half)
+    }
+}
+
+/// Goldilocks modular multiplication using interleaved dual-lane ASM (on scalar registers).
+#[inline]
+fn mul(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let x0 = vgetq_lane_u64::<0>(x);
+        let x1 = vgetq_lane_u64::<1>(x);
+        let y0 = vgetq_lane_u64::<0>(y);
+        let y1 = vgetq_lane_u64::<1>(y);
+        // Lane 0 and lane 1 are multiplied as two independent scalar products.
+        let (res_0, res_1) = mul_reduce_dual_asm(x0, y0, x1, y1);
+
+        transmute([res_0, res_1])
+    }
+}
+
+/// Computes `a0 * b0` and `a1 * b1` modulo P with two interleaved scalar ASM sequences.
+/// Uses shift-based EPSILON multiplication: hi_lo * EPSILON = (hi_lo << 32) - hi_lo
+#[inline(always)]
+unsafe fn mul_reduce_dual_asm(a0: u64, b0: u64, a1: u64, b1: u64) -> (u64, u64) {
+    use core::arch::asm;
+    let result0: u64;
+    let result1: u64;
+    // Both products run through the same instruction sequence; the 0/1 suffix picks the product.
+    unsafe {
+        asm!(
+            // Compute both 128-bit products (interleaved for ILP)
+            "mul   {lo0}, {a0}, {b0}",
+            "mul   {lo1}, {a1}, {b1}",
+            "umulh {hi0}, {a0}, {b0}",
+            "umulh {hi1}, {a1}, {b1}",
+            // Reduce with 2^64 = EPSILON and 2^96 = -1 (mod P): lo - hi_hi + hi_lo * EPSILON.
+            // hi_hi = hi >> 32
+            "lsr   {hi_hi0}, {hi0}, #32",
+            "lsr   {hi_hi1}, {hi1}, #32",
+            // `csetm` on the `:w` view yields 0xFFFF_FFFF (= EPSILON) if the condition holds, else 0.
+            // tmp = lo - hi_hi (with borrow handling)
+            "subs  {tmp0}, {lo0}, {hi_hi0}",
+            "csetm {adj0:w}, cc",
+            "subs  {tmp1}, {lo1}, {hi_hi1}",
+            "csetm {adj1:w}, cc",
+            "sub   {tmp0}, {tmp0}, {adj0}",
+            "sub   {tmp1}, {tmp1}, {adj1}",
+            // (A borrow wrapped tmp by 2^64 = EPSILON mod P, hence the extra subtraction above.)
+            // hi_lo = hi & EPSILON
+            "and   {hi_lo0}, {hi0}, {epsilon}",
+            "and   {hi_lo1}, {hi1}, {epsilon}",
+
+            // hi_lo_eps = (hi_lo << 32) - hi_lo (avoids multiply)
+            "lsl   {t0}, {hi_lo0}, #32",
+            "lsl   {t1}, {hi_lo1}, #32",
+            "sub   {hi_lo_eps0}, {t0}, {hi_lo0}",
+            "sub   {hi_lo_eps1}, {t1}, {hi_lo1}",
+
+            // result = tmp + hi_lo_eps (with overflow handling)
+            "adds  {result0}, {tmp0}, {hi_lo_eps0}",
+            "csetm {adj0:w}, cs",
+            "adds  {result1}, {tmp1}, {hi_lo_eps1}",
+            "csetm {adj1:w}, cs",
+            "add   {result0}, {result0}, {adj0}",
+            "add   {result1}, {result1}, {adj1}",
+            // Operands: five inputs, sixteen discarded scratch outputs, two results.
+            a0 = in(reg) a0,
+            b0 = in(reg) b0,
+            a1 = in(reg) a1,
+            b1 = in(reg) b1,
+            epsilon = in(reg) EPSILON,
+            lo0 = out(reg) _,
+            lo1 = out(reg) _,
+            hi0 = out(reg) _,
+            hi1 = out(reg) _,
+            hi_hi0 = out(reg) _,
+            hi_hi1 = out(reg) _,
+            tmp0 = out(reg) _,
+            tmp1 = out(reg) _,
+            hi_lo0 = out(reg) _,
+            hi_lo1 = out(reg) _,
+            t0 = out(reg) _,
+            t1 = out(reg) _,
+            hi_lo_eps0 = out(reg) _,
+            hi_lo_eps1 = out(reg) _,
+            adj0 = out(reg) _,
+            adj1 = out(reg) _,
+            result0 = out(reg) result0,
+            result1 = out(reg) result1,
+            options(pure, nomem, nostack), // no `preserves_flags`: `subs`/`adds` write the flags
+        );
+    }
+    // No final comparison against P is made, so the results need not be canonical.
+    (result0, result1)
+}
+
+/// Goldilocks modular square using interleaved dual-lane ASM (each lane multiplied by itself).
+#[inline]
+fn square(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let x0 = vgetq_lane_u64::<0>(x);
+        let x1 = vgetq_lane_u64::<1>(x);
+        // Same kernel as `mul`, with both factors of each product taken from the same lane.
+        let (res_0, res_1) = mul_reduce_dual_asm(x0, x0, x1, x1);
+
+        transmute([res_0, res_1])
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field_testing::test_packed_field;
+
+    use super::{Goldilocks, PackedGoldilocksNeon, WIDTH};
+    // Edge-case lanes: 0xFFFF_FFFF_0000_0000 = P - 1 and 0xFFFF_FFFF_FFFF_FFFF = u64::MAX.
+    const SPECIAL_VALS: [Goldilocks; WIDTH] =
+        Goldilocks::new_array([0xFFFF_FFFF_0000_0000, 0xFFFF_FFFF_FFFF_FFFF]);
+    // Zero in both lanes: once as 0 and once as the non-canonical raw value P.
+    const ZEROS: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([
+        0x0000_0000_0000_0000,
+        0xFFFF_FFFF_0000_0001, // = P, canonicalizes to 0
+    ]));
+    // One in both lanes: once as 1 and once as the non-canonical raw value P + 1.
+    const ONES: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([
+        0x0000_0000_0000_0001,
+        0xFFFF_FFFF_0000_0002, // = P + 1, canonicalizes to 1
+    ]));
+    // NOTE(review): the macro comes from `p3_field_testing`; the tests it generates are not shown.
+    test_packed_field!(
+        crate::PackedGoldilocksNeon,
+        &[super::ZEROS],
+        &[super::ONES],
+        crate::PackedGoldilocksNeon(super::SPECIAL_VALS)
+    );
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs
new file mode 100644
index 000000000..0a877578a
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs
@@ -0,0 +1,716 @@
+//! Fused Poseidon1 permutation for Goldilocks on aarch64.
+
+use alloc::vec::Vec;
+
+use p3_poseidon1::{
+    FullRoundConstants, PartialRoundConstants, full_round_initial_permute_state,
+    full_round_terminal_permute_state, partial_permute_state,
+};
+use p3_symmetric::{CryptographicPermutation, Permutation};
+
+use super::mds::{MdsNeonGoldilocks, mds_neon_w8, mds_neon_w12};
+use super::packing::PackedGoldilocksNeon;
+use super::poseidon1_asm::*;
+use super::poseidon2_asm::{sbox_layer_asm, sbox_layer_dual_asm};
+use super::utils::{pack_lanes, unpack_lanes};
+use crate::Goldilocks;
+
+/// Fused Poseidon1 permutation for Goldilocks.
+///
+/// Holds raw `u64` constants copied unchanged (via `.value`) from the optimized Poseidon1
+/// sparse-matrix decomposition. Storing raw values avoids field-element overhead in the
+/// hot inner loop. `Permutation` is implemented in this file for widths 8 and 12 only.
+#[derive(Clone, Debug)]
+pub struct Poseidon1GoldilocksFused<const WIDTH: usize> {
+    /// Round constants for the initial full rounds (RF/2 vectors).
+    initial_constants_raw: Vec<[u64; WIDTH]>,
+    /// Round constants for the terminal full rounds (RF/2 vectors).
+    terminal_constants_raw: Vec<[u64; WIDTH]>,
+    /// Full-width constant vector for the first partial round.
+    first_round_constants_raw: [u64; WIDTH],
+    /// Dense transition matrix applied once before entering the partial-round loop.
+    m_i_raw: [[u64; WIDTH]; WIDTH],
+    /// Per-round first row of the sparse matrix; its length is used as the partial-round count.
+    sparse_first_row_raw: Vec<[u64; WIDTH]>,
+    /// Per-round sub-diagonal vector for the sparse matmul (one per partial round).
+    v_raw: Vec<[u64; WIDTH]>,
+    /// Scalar round constants for partial rounds 0 through RP-2.
+    ///
+    /// The last partial round has no scalar constant (it ends with the S-box only).
+    round_constants_raw: Vec<u64>,
+}
+
+impl<const WIDTH: usize> Poseidon1GoldilocksFused<WIDTH> {
+    /// Create from pre-computed full and partial round constants.
+    ///
+    /// Copies the `value` field of every Goldilocks constant into plain `u64` arrays for
+    /// the ASM kernels. Lengths are not validated; the permutations index these vectors directly.
+    pub fn new(
+        full: &FullRoundConstants<Goldilocks, WIDTH>,
+        partial: &PartialRoundConstants<Goldilocks, WIDTH>,
+    ) -> Self {
+        // Extract raw u64 values from full-round constant matrices.
+        let initial_constants_raw = full
+            .initial
+            .iter()
+            .map(|rc| core::array::from_fn(|i| rc[i].value))
+            .collect();
+        let terminal_constants_raw = full
+            .terminal
+            .iter()
+            .map(|rc| core::array::from_fn(|i| rc[i].value))
+            .collect();
+
+        // Extract the first partial-round constant vector.
+        let first_round_constants_raw =
+            core::array::from_fn(|i| partial.first_round_constants[i].value);
+
+        // Extract the dense transition matrix.
+        let m_i_raw = core::array::from_fn(|i| core::array::from_fn(|j| partial.m_i[i][j].value));
+        // NOTE: the partial-round helpers read `sparse_first_row[len - 1]`, so it must be non-empty.
+        // Extract per-round sparse matrix data.
+        let sparse_first_row_raw = partial
+            .sparse_first_row
+            .iter()
+            .map(|r| core::array::from_fn(|i| r[i].value))
+            .collect();
+        let v_raw = partial
+            .v
+            .iter()
+            .map(|r| core::array::from_fn(|i| r[i].value))
+            .collect();
+
+        // Extract scalar round constants for partial rounds.
+        let round_constants_raw = partial.round_constants.iter().map(|c| c.value).collect();
+
+        Self {
+            initial_constants_raw,
+            terminal_constants_raw,
+            first_round_constants_raw,
+            m_i_raw,
+            sparse_first_row_raw,
+            v_raw,
+            round_constants_raw,
+        }
+    }
+}
+
+/// Run the initial or terminal full rounds on a raw width-8 state.
+/// One round runs per entry of `constants`, in slice order; `raw` is updated in place.
+/// Each full round applies: add constants, S-box on all elements, NEON MDS.
+#[inline]
+fn full_rounds_scalar_w8(raw: &mut [u64; 8], constants: &[[u64; 8]]) {
+    for rc in constants {
+        unsafe {
+            add_rc_asm(raw, rc);
+            sbox_layer_asm(raw);
+        }
+        *raw = unsafe { mds_neon_w8(raw) }; // the MDS returns a new array that replaces the state
+    }
+}
+
+/// Run the initial or terminal full rounds on a raw width-12 state.
+/// One round runs per entry of `constants`, in slice order; `raw` is updated in place.
+/// Each full round applies: add constants, S-box on all elements, NEON MDS.
+#[inline]
+fn full_rounds_scalar_w12(raw: &mut [u64; 12], constants: &[[u64; 12]]) {
+    for rc in constants {
+        unsafe {
+            add_rc_asm(raw, rc);
+            sbox_layer_asm(raw);
+        }
+        *raw = unsafe { mds_neon_w12(raw) }; // the MDS returns a new array that replaces the state
+    }
+}
+
+/// Run all partial rounds on a raw width-8 state.
+///
+/// The partial-round sequence is:
+/// 1. Add the first-round full-width constant vector.
+/// 2. Apply the dense transition matrix once.
+/// 3. For each partial round (except the last):
+///    S-box on first element, add scalar constant, sparse matmul.
+/// 4. Last partial round: S-box on first element, sparse matmul (no constant).
+#[inline]
+fn partial_rounds_scalar_w8(
+    raw: &mut [u64; 8],
+    first_rc: &[u64; 8],
+    m_i: &[[u64; 8]; 8],
+    sparse_first_row: &[[u64; 8]], // one entry per partial round; must not be empty
+    v: &[[u64; 8]], // indexed in step with `sparse_first_row`
+    round_constants: &[u64], // indexed for every round except the last
+) {
+    // Add the first-round full-width constant vector.
+    unsafe {
+        add_rc_asm(raw, first_rc);
+    }
+
+    // Apply the dense transition matrix once.
+    dense_matmul_asm_w8(raw, m_i);
+    // The round count is `sparse_first_row.len()`; `rounds_p - 1` below needs it to be >= 1.
+    // Main partial-round loop: S-box + scalar constant + sparse matmul.
+    let rounds_p = sparse_first_row.len();
+    for r in 0..rounds_p - 1 {
+        unsafe {
+            sbox_s0_asm(raw);
+            add_scalar_s0_asm(raw, round_constants[r]);
+            cheap_matmul_asm_w8(raw, &sparse_first_row[r], &v[r]);
+        }
+    }
+
+    // Last partial round: no scalar constant.
+    unsafe {
+        sbox_s0_asm(raw);
+        cheap_matmul_asm_w8(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]);
+    }
+}
+
+/// Run all partial rounds on a raw width-12 state.
+///
+/// Same structure as the width-8 variant.
+#[inline]
+fn partial_rounds_scalar_w12(
+    raw: &mut [u64; 12],
+    first_rc: &[u64; 12],
+    m_i: &[[u64; 12]; 12],
+    sparse_first_row: &[[u64; 12]], // one entry per partial round; must not be empty
+    v: &[[u64; 12]], // indexed in step with `sparse_first_row`
+    round_constants: &[u64], // indexed for every round except the last
+) {
+    unsafe {
+        add_rc_asm(raw, first_rc); // first-round full-width constant vector
+    }
+    dense_matmul_asm_w12(raw, m_i); // dense transition matrix, applied once
+    // All rounds but the last: S-box on element 0, scalar constant, sparse matmul.
+    let rounds_p = sparse_first_row.len();
+    for r in 0..rounds_p - 1 {
+        unsafe {
+            sbox_s0_asm(raw);
+            add_scalar_s0_asm(raw, round_constants[r]);
+            cheap_matmul_asm_w12(raw, &sparse_first_row[r], &v[r]);
+        }
+    }
+    unsafe {
+        sbox_s0_asm(raw); // last round: no scalar constant is added
+        cheap_matmul_asm_w12(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]);
+    }
+}
+
+/// Run the initial or terminal full rounds on two raw width-8 lanes.
+/// Both lanes receive the same round constants; one round runs per entry of `constants`.
+/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane.
+#[inline]
+fn full_rounds_dual_w8(lane0: &mut [u64; 8], lane1: &mut [u64; 8], constants: &[[u64; 8]]) {
+    for rc in constants {
+        unsafe {
+            add_rc_dual_asm(lane0, lane1, rc);
+            sbox_layer_dual_asm(lane0, lane1);
+        }
+        *lane0 = unsafe { mds_neon_w8(lane0) }; // the MDS is applied to each lane on its own
+        *lane1 = unsafe { mds_neon_w8(lane1) };
+    }
+}
+
+/// Run the initial or terminal full rounds on two raw width-12 lanes.
+/// Both lanes receive the same round constants; one round runs per entry of `constants`.
+/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane.
+#[inline]
+fn full_rounds_dual_w12(lane0: &mut [u64; 12], lane1: &mut [u64; 12], constants: &[[u64; 12]]) {
+    for rc in constants {
+        unsafe {
+            add_rc_dual_asm(lane0, lane1, rc);
+            sbox_layer_dual_asm(lane0, lane1);
+        }
+        *lane0 = unsafe { mds_neon_w12(lane0) }; // the MDS is applied to each lane on its own
+        *lane1 = unsafe { mds_neon_w12(lane1) };
+    }
+}
+
+/// Run all partial rounds on two width-8 lanes simultaneously.
+///
+/// Uses dual-lane S-box and sparse matmul primitives to keep the
+/// pipeline full. The scalar constant is added to each lane separately
+/// (no dual variant needed for a single-element addition).
+#[inline]
+fn partial_rounds_dual_w8(
+    lane0: &mut [u64; 8],
+    lane1: &mut [u64; 8],
+    first_rc: &[u64; 8],
+    m_i: &[[u64; 8]; 8],
+    sparse_first_row: &[[u64; 8]], // one entry per partial round; must not be empty
+    v: &[[u64; 8]], // indexed in step with `sparse_first_row`
+    round_constants: &[u64], // indexed for every round except the last
+) {
+    // Add the first-round constant to both lanes.
+    unsafe {
+        add_rc_dual_asm(lane0, lane1, first_rc);
+    }
+
+    // Dense transition matrix on both lanes.
+    dense_matmul_dual_asm_w8(lane0, lane1, m_i);
+    // The round count is `sparse_first_row.len()`; `rounds_p - 1` below needs it to be >= 1.
+    // Main partial-round loop.
+    let rounds_p = sparse_first_row.len();
+    for r in 0..rounds_p - 1 {
+        unsafe {
+            sbox_s0_dual_asm(lane0, lane1);
+            add_scalar_s0_asm(lane0, round_constants[r]);
+            add_scalar_s0_asm(lane1, round_constants[r]);
+            cheap_matmul_dual_asm_w8(lane0, lane1, &sparse_first_row[r], &v[r]);
+        }
+    }
+
+    // Last partial round: no scalar constant.
+    unsafe {
+        sbox_s0_dual_asm(lane0, lane1);
+        cheap_matmul_dual_asm_w8(
+            lane0,
+            lane1,
+            &sparse_first_row[rounds_p - 1],
+            &v[rounds_p - 1],
+        );
+    }
+}
+
+/// Run all partial rounds on two width-12 lanes simultaneously.
+///
+/// Same structure as the width-8 dual variant.
+#[inline]
+fn partial_rounds_dual_w12(
+    lane0: &mut [u64; 12],
+    lane1: &mut [u64; 12],
+    first_rc: &[u64; 12],
+    m_i: &[[u64; 12]; 12],
+    sparse_first_row: &[[u64; 12]], // one entry per partial round; must not be empty
+    v: &[[u64; 12]], // indexed in step with `sparse_first_row`
+    round_constants: &[u64], // indexed for every round except the last
+) {
+    unsafe {
+        add_rc_dual_asm(lane0, lane1, first_rc); // same first-round constants for both lanes
+    }
+    dense_matmul_dual_asm_w12(lane0, lane1, m_i); // dense transition matrix, applied once
+    // All rounds but the last: S-box on element 0, scalar constant per lane, sparse matmul.
+    let rounds_p = sparse_first_row.len();
+    for r in 0..rounds_p - 1 {
+        unsafe {
+            sbox_s0_dual_asm(lane0, lane1);
+            add_scalar_s0_asm(lane0, round_constants[r]);
+            add_scalar_s0_asm(lane1, round_constants[r]);
+            cheap_matmul_dual_asm_w12(lane0, lane1, &sparse_first_row[r], &v[r]);
+        }
+    }
+    unsafe {
+        sbox_s0_dual_asm(lane0, lane1); // last round: no scalar constant is added
+        cheap_matmul_dual_asm_w12(
+            lane0,
+            lane1,
+            &sparse_first_row[rounds_p - 1],
+            &v[rounds_p - 1],
+        );
+    }
+}
+
+impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
+        // In-place view of the state as raw u64s; relies on Goldilocks being repr(transparent) over u64.
+        let raw = unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) };
+        // Every phase below writes to `state` through `raw`; nothing is copied back afterwards.
+        // Initial full rounds, then partial rounds, then terminal full rounds.
+        full_rounds_scalar_w8(raw, &self.initial_constants_raw);
+        partial_rounds_scalar_w8(
+            raw,
+            &self.first_round_constants_raw,
+            &self.m_i_raw,
+            &self.sparse_first_row_raw,
+            &self.v_raw,
+            &self.round_constants_raw,
+        );
+        full_rounds_scalar_w8(raw, &self.terminal_constants_raw);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> {}
+
+impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) {
+        // Split the packed state into two separate raw width-8 states, one per lane.
+        let (mut lane0, mut lane1) = unpack_lanes(state);
+
+        // Run the full permutation on both lanes with the dual-lane helpers (same constants).
+        full_rounds_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw);
+        partial_rounds_dual_w8(
+            &mut lane0,
+            &mut lane1,
+            &self.first_round_constants_raw,
+            &self.m_i_raw,
+            &self.sparse_first_row_raw,
+            &self.v_raw,
+            &self.round_constants_raw,
+        );
+        full_rounds_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw);
+
+        // Write both lanes back into `state`.
+        pack_lanes(state, &lane0, &lane1);
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> {}
+
+impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
+        let raw = unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) };
+        // `raw` aliases `state` as raw u64s: initial full, partial, then terminal full rounds.
+        full_rounds_scalar_w12(raw, &self.initial_constants_raw);
+        partial_rounds_scalar_w12(
+            raw,
+            &self.first_round_constants_raw,
+            &self.m_i_raw,
+            &self.sparse_first_row_raw,
+            &self.v_raw,
+            &self.round_constants_raw,
+        );
+        full_rounds_scalar_w12(raw, &self.terminal_constants_raw);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> {}
+
+impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) {
+        let (mut lane0, mut lane1) = unpack_lanes(state);
+        // Same three phases as the width-8 packed impl, using the width-12 dual-lane helpers.
+        full_rounds_dual_w12(&mut lane0, &mut lane1, &self.initial_constants_raw);
+        partial_rounds_dual_w12(
+            &mut lane0,
+            &mut lane1,
+            &self.first_round_constants_raw,
+            &self.m_i_raw,
+            &self.sparse_first_row_raw,
+            &self.v_raw,
+            &self.round_constants_raw,
+        );
+        full_rounds_dual_w12(&mut lane0, &mut lane1, &self.terminal_constants_raw);
+        // Write both lanes back into `state`.
+        pack_lanes(state, &lane0, &lane1);
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> {}
+
+/// Dual-dispatch wrapper for Goldilocks Poseidon1.
+///
+/// **Scalar** permutations use the NEON-accelerated MDS for full rounds
+/// and LLVM-optimized sparse matrix decomposition for partial rounds.
+/// This avoids sequential inline ASM that would prevent LLVM's
+/// instruction scheduling optimizations on wide out-of-order cores.
+///
+/// **Packed** permutations: width 8 delegates to the fused dual-lane ASM path
+/// (`Poseidon1GoldilocksFused`); width 12 instead runs the scalar path above on
+/// each of the two lanes in turn and does not touch `fused`.
+#[derive(Clone, Debug)]
+pub struct Poseidon1GoldilocksDispatch<const WIDTH: usize> {
+    /// Fused dual-lane path — used by the width-8 packed permutation only.
+    fused: Poseidon1GoldilocksFused<WIDTH>,
+    /// Pre-computed full round constants for NEON MDS.
+    full_constants: FullRoundConstants<Goldilocks, WIDTH>,
+    /// Pre-computed partial round constants, passed to `partial_permute_state`.
+    partial_constants: PartialRoundConstants<Goldilocks, WIDTH>,
+}
+
+impl<const WIDTH: usize> Poseidon1GoldilocksDispatch<WIDTH> {
+    /// Create from fused and pre-computed constants (stored as given; consistency is not checked).
+    pub const fn new(
+        fused: Poseidon1GoldilocksFused<WIDTH>,
+        full_constants: FullRoundConstants<Goldilocks, WIDTH>,
+        partial_constants: PartialRoundConstants<Goldilocks, WIDTH>,
+    ) -> Self {
+        Self {
+            fused,
+            full_constants,
+            partial_constants,
+        }
+    }
+}
+
+// --- Width 8 ---
+
+impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
+        let mds = MdsNeonGoldilocks; // MDS layer handed to both full-round phases
+        full_round_initial_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds);
+        partial_permute_state::<_, _, 8, 7>(state, &self.partial_constants); // `fused` is unused
+        full_round_terminal_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> {}
+
+impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) {
+        self.fused.permute_mut(state); // packed width 8 goes straight to the fused dual-lane path
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> {}
+
+// --- Width 12 ---
+
+impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
+        let mds = MdsNeonGoldilocks; // MDS layer handed to both full-round phases
+        full_round_initial_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds);
+        partial_permute_state::<_, _, 12, 7>(state, &self.partial_constants); // `fused` is unused
+        full_round_terminal_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> {}
+
+impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) {
+        // Unlike width 8, this does NOT delegate to `self.fused`: each lane is copied out and
+        // run through the same three calls as the scalar width-12 impl, then repacked. Stated
+        // rationale: avoid trait-dispatch overhead and enable cross-call inlining.
+        let mut lane0: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[0]);
+        let mut lane1: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[1]);
+
+        let mds = MdsNeonGoldilocks;
+        full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds);
+        partial_permute_state::<_, _, 12, 7>(&mut lane0, &self.partial_constants);
+        full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds);
+        // Lane 1 is processed only after lane 0 has finished (the lanes are not interleaved).
+        full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds);
+        partial_permute_state::<_, _, 12, 7>(&mut lane1, &self.partial_constants);
+        full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds);
+        // Repack: element `i` of each lane becomes the two lanes of `state[i]`.
+        for i in 0..12 {
+            state[i] = PackedGoldilocksNeon([lane0[i], lane1[i]]);
+        }
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> {}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::{PrimeCharacteristicRing, PrimeField64};
+    use p3_poseidon1::Poseidon1Constants;
+    use p3_symmetric::Permutation;
+    use rand::rngs::SmallRng;
+    use rand::{RngExt, SeedableRng};
+
+    use super::*;
+    use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL};
+    use crate::poseidon1::{
+        GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+        GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, GOLDILOCKS_POSEIDON1_RC_8,
+        GOLDILOCKS_POSEIDON1_RC_12, default_goldilocks_poseidon1_8,
+        default_goldilocks_poseidon1_12,
+    };
+
+    type F = Goldilocks;
+
+    /// Build a width-8 fused permutation from the crate's fixed width-8 Poseidon1 constants.
+    fn make_fused_w8() -> Poseidon1GoldilocksFused<8> {
+        let raw = Poseidon1Constants {
+            rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, // total = twice the half count
+            rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+            mds_circ_col: MATRIX_CIRC_MDS_8_COL,
+            round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(),
+        };
+        let (full, partial) = raw.to_optimized(); // yields the full- and partial-round constant sets
+        Poseidon1GoldilocksFused::new(&full, &partial)
+    }
+
+    /// Build a width-12 fused permutation from the fixed round constants.
+    fn make_fused_w12() -> Poseidon1GoldilocksFused<12> {
+        let raw = Poseidon1Constants {
+            rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+            rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12,
+            mds_circ_col: MATRIX_CIRC_MDS_12_COL,
+            round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(),
+        };
+        let (full, partial) = raw.to_optimized();
+        Poseidon1GoldilocksFused::new(&full, &partial)
+    }
+
+    /// Verify that the fused width-8 implementation matches the generic one
+    /// on both zero and random inputs.
+    #[test]
+    fn test_fused_matches_generic_w8() {
+        let generic = default_goldilocks_poseidon1_8();
+        let fused = make_fused_w8();
+        let mut rng = SmallRng::seed_from_u64(42);
+
+        // Zero input.
+        let mut g_state = [F::ZERO; 8];
+        let mut f_state = [F::ZERO; 8];
+        generic.permute_mut(&mut g_state);
+        fused.permute_mut(&mut f_state);
+        for i in 0..8 {
+            assert_eq!(
+                f_state[i].as_canonical_u64(),
+                g_state[i].as_canonical_u64(),
+                "Fused vs generic mismatch at index {i} (zero input, w8)"
+            );
+        }
+
+        // Random input.
+        let mut g_state: [F; 8] = rng.random();
+        let mut f_state = g_state;
+        generic.permute_mut(&mut g_state);
+        fused.permute_mut(&mut f_state);
+        for i in 0..8 {
+            assert_eq!(
+                f_state[i].as_canonical_u64(),
+                g_state[i].as_canonical_u64(),
+                "Fused vs generic mismatch at index {i} (random input, w8)"
+            );
+        }
+    }
+
+    /// Same fused-vs-generic verification for width 12.
+    #[test]
+    fn test_fused_matches_generic_w12() {
+        let generic = default_goldilocks_poseidon1_12();
+        let fused = make_fused_w12();
+        let mut rng = SmallRng::seed_from_u64(42);
+
+        let mut g_state = [F::ZERO; 12];
+        let mut f_state = [F::ZERO; 12];
+        generic.permute_mut(&mut g_state);
+        fused.permute_mut(&mut f_state);
+        for i in 0..12 {
+            assert_eq!(
+                f_state[i].as_canonical_u64(),
+                g_state[i].as_canonical_u64(),
+                "Fused vs generic mismatch at index {i} (zero input, w12)"
+            );
+        }
+
+        let mut g_state: [F; 12] = rng.random();
+        let mut f_state = g_state;
+        generic.permute_mut(&mut g_state);
+        fused.permute_mut(&mut f_state);
+        for i in 0..12 {
+            assert_eq!(
+                f_state[i].as_canonical_u64(),
+                g_state[i].as_canonical_u64(),
+                "Fused vs generic mismatch at index {i} (random input, w12)"
+            );
+        }
+    }
+
+    /// Verify that the packed (dual-lane) width-8 path matches running
+    /// two independent scalar permutations.
+    #[test]
+    fn test_packed_matches_scalar_w8() {
+        let fused = make_fused_w8();
+        let mut rng = SmallRng::seed_from_u64(123);
+
+        // Two independent random scalar inputs.
+        let scalar_a: [F; 8] = rng.random();
+        let scalar_b: [F; 8] = rng.random();
+
+        // Pack them into a single packed state and permute.
+        let mut packed: [PackedGoldilocksNeon; 8] =
+            core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]]));
+        fused.permute_mut(&mut packed);
+
+        // Compute the expected result by running scalar on each independently.
+        let mut expected_a = scalar_a;
+        let mut expected_b = scalar_b;
+        fused.permute_mut(&mut expected_a);
+        fused.permute_mut(&mut expected_b);
+
+        // Lane 0 must match the first scalar, lane 1 must match the second.
+        for i in 0..8 {
+            assert_eq!(
+                packed[i].0[0].as_canonical_u64(),
+                expected_a[i].as_canonical_u64(),
+                "Packed lane0 mismatch at index {i} (w8)"
+            );
+            assert_eq!(
+                packed[i].0[1].as_canonical_u64(),
+                expected_b[i].as_canonical_u64(),
+                "Packed lane1 mismatch at index {i} (w8)"
+            );
+        }
+    }
+
+    /// Same packed-vs-scalar verification for width 12.
+    #[test]
+    fn test_packed_matches_scalar_w12() {
+        let fused = make_fused_w12();
+        let mut rng = SmallRng::seed_from_u64(123);
+
+        let scalar_a: [F; 12] = rng.random();
+        let scalar_b: [F; 12] = rng.random();
+
+        let mut packed: [PackedGoldilocksNeon; 12] =
+            core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]]));
+        fused.permute_mut(&mut packed);
+
+        let mut expected_a = scalar_a;
+        let mut expected_b = scalar_b;
+        fused.permute_mut(&mut expected_a);
+        fused.permute_mut(&mut expected_b);
+
+        for i in 0..12 {
+            assert_eq!(
+                packed[i].0[0].as_canonical_u64(),
+                expected_a[i].as_canonical_u64(),
+                "Packed lane0 mismatch at index {i} (w12)"
+            );
+            assert_eq!(
+                packed[i].0[1].as_canonical_u64(),
+                expected_b[i].as_canonical_u64(),
+                "Packed lane1 mismatch at index {i} (w12)"
+            );
+        }
+    }
+
+    /// Known-answer test for width 8 (sequential 0..7 input).
+    #[test]
+    fn test_fused_kat_w8() {
+        let fused = make_fused_w8();
+        let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
+        fused.permute_mut(&mut input);
+
+        let expected: [F; 8] = F::new_array([
+            2431226948502761687,
+            9427563026145807618,
+            6827549936272051660,
+            16907684411084503785,
+            10131745626715172913,
+            17448305483431576765,
+            9066501914269485014,
+            12095238468458521303,
+        ]);
+        assert_eq!(input, expected);
+    }
+
+    /// Known-answer test for width 12 (sequential 0..11 input).
+    #[test]
+    fn test_fused_kat_w12() {
+        let fused = make_fused_w12();
+        let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
+        fused.permute_mut(&mut input);
+
+        let expected: [F; 12] = F::new_array([
+            15595088881848875364,
+            9564850329150784619,
+            13607005230761744521,
+            12117102595842533385,
+            2814257411756993122,
+            11640647689983397089,
+            14363867760831937423,
+            13323891071259596526,
+            11219803511311150468,
+            9221595262780869902,
+            5898229059046891887,
+            18181291031484020550,
+        ]);
+        assert_eq!(input, expected);
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs
new file mode 100644
index 000000000..3ca1382a9
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs
@@ -0,0 +1,843 @@
+//! ARM assembly primitives for the Poseidon1 permutation over Goldilocks.
+
+use super::utils::{add_asm, mul_add_asm, mul_asm};
+
+// ---------------------------------------------------------------------------
+// S-box: x -> x^7 (applied to the first element only)
+// ---------------------------------------------------------------------------
+
+/// Apply the degree-7 S-box to the first element of the state, in place.
+///
+/// `x^7` is assembled from four multiplications:
+///
+/// ```text
+///     x2 = x * x,   x3 = x2 * x,   x4 = x2 * x2,   x7 = x3 * x4
+/// ```
+///
+/// Every element after the first is left as it was; this is the non-linear
+/// step of a **partial round**. Panics if `state` is empty.
+///
+/// # Safety
+///
+/// Same requirements as the `mul_asm` primitive that performs every multiply.
+#[inline(always)]
+pub unsafe fn sbox_s0_asm(state: &mut [u64]) {
+    unsafe {
+        // Value entering the S-box.
+        let x = state[0];
+
+        // x^2.
+        let x_sq = mul_asm(x, x);
+
+        // x^3 = x^2 * x and x^4 = x^2 * x^2.
+        let x_cube = mul_asm(x_sq, x);
+        let x_quad = mul_asm(x_sq, x_sq);
+
+        // x^7 = x^3 * x^4, written back over the first slot.
+        state[0] = mul_asm(x_cube, x_quad);
+    }
+}
+
+/// Apply the degree-7 S-box to the first element of two independent states.
+///
+/// Each step of the `x^2 -> x^3, x^4 -> x^7` chain is issued for lane 0 and then
+/// lane 1, so the two dependency chains alternate in program order.
+///
+/// # Safety
+///
+/// Same requirements as the `mul_asm` primitive that performs every multiply.
+#[inline(always)]
+pub unsafe fn sbox_s0_dual_asm(state0: &mut [u64], state1: &mut [u64]) {
+    unsafe {
+        let x0 = state0[0];
+        let x1 = state1[0];
+
+        // Squares.
+        let sq0 = mul_asm(x0, x0);
+        let sq1 = mul_asm(x1, x1);
+
+        // Cubes (x^2 * x), then fourth powers (x^2 * x^2).
+        let cube0 = mul_asm(sq0, x0);
+        let cube1 = mul_asm(sq1, x1);
+        let quad0 = mul_asm(sq0, sq0);
+        let quad1 = mul_asm(sq1, sq1);
+
+        // Seventh powers (x^3 * x^4), stored back into each lane.
+        state0[0] = mul_asm(cube0, quad0);
+        state1[0] = mul_asm(cube1, quad1);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Sparse matrix-vector multiply (partial-round linear layer)
+// ---------------------------------------------------------------------------
+
+/// Sparse matrix-vector multiply for a width-8 state (partial-round linear layer).
+///
+/// The matrix is described by its first row and by `v`, which supplies the
+/// coefficient of the old `state[0]` in each of the remaining rows:
+///
+/// ```text
+///     new[0] = dot(first_row, state)
+///     new[i] = state[i] + state[0] * v[i-1]    (for i >= 1)
+/// ```
+///
+/// Only `v[0..=6]` is read. The body is written out term by term, without loops.
+///
+/// # Safety
+///
+/// Same requirements as the `mul_asm` / `mul_add_asm` primitives it calls.
+#[inline(always)]
+pub unsafe fn cheap_matmul_asm_w8(state: &mut [u64; 8], first_row: &[u64; 8], v: &[u64; 8]) {
+    unsafe {
+        // Keep the incoming first element: the tail updates below still need it.
+        let head = state[0];
+
+        // new[0]: running dot product of `first_row` with the incoming state.
+        let mut dot = mul_asm(head, first_row[0]);
+        dot = mul_add_asm(state[1], first_row[1], dot);
+        dot = mul_add_asm(state[2], first_row[2], dot);
+        dot = mul_add_asm(state[3], first_row[3], dot);
+        dot = mul_add_asm(state[4], first_row[4], dot);
+        dot = mul_add_asm(state[5], first_row[5], dot);
+        dot = mul_add_asm(state[6], first_row[6], dot);
+        dot = mul_add_asm(state[7], first_row[7], dot);
+
+        // new[i] = state[i] + head * v[i-1]; the first slot is stored last.
+        state[1] = mul_add_asm(head, v[0], state[1]);
+        state[2] = mul_add_asm(head, v[1], state[2]);
+        state[3] = mul_add_asm(head, v[2], state[3]);
+        state[4] = mul_add_asm(head, v[3], state[4]);
+        state[5] = mul_add_asm(head, v[4], state[5]);
+        state[6] = mul_add_asm(head, v[5], state[6]);
+        state[7] = mul_add_asm(head, v[6], state[7]);
+        state[0] = dot;
+    }
+}
+
+/// Sparse matrix-vector multiply for a width-12 state.
+///
+/// Width-12 counterpart of the width-8 routine: `new[0] = dot(first_row, state)`
+/// and `new[i] = state[i] + state[0] * v[i-1]` for `i >= 1` (`v[11]` is unread).
+///
+/// # Safety
+///
+/// Same requirements as the `mul_asm` / `mul_add_asm` primitives it calls.
+#[inline(always)]
+pub unsafe fn cheap_matmul_asm_w12(state: &mut [u64; 12], first_row: &[u64; 12], v: &[u64; 12]) {
+    unsafe {
+        let head = state[0];
+
+        // new[0]: running dot product of `first_row` with the incoming state.
+        let mut dot = mul_asm(head, first_row[0]);
+        dot = mul_add_asm(state[1], first_row[1], dot);
+        dot = mul_add_asm(state[2], first_row[2], dot);
+        dot = mul_add_asm(state[3], first_row[3], dot);
+        dot = mul_add_asm(state[4], first_row[4], dot);
+        dot = mul_add_asm(state[5], first_row[5], dot);
+        dot = mul_add_asm(state[6], first_row[6], dot);
+        dot = mul_add_asm(state[7], first_row[7], dot);
+        dot = mul_add_asm(state[8], first_row[8], dot);
+        dot = mul_add_asm(state[9], first_row[9], dot);
+        dot = mul_add_asm(state[10], first_row[10], dot);
+        dot = mul_add_asm(state[11], first_row[11], dot);
+
+        // new[i] = state[i] + head * v[i-1]; the first slot is stored last.
+        state[1] = mul_add_asm(head, v[0], state[1]);
+        state[2] = mul_add_asm(head, v[1], state[2]);
+        state[3] = mul_add_asm(head, v[2], state[3]);
+        state[4] = mul_add_asm(head, v[3], state[4]);
+        state[5] = mul_add_asm(head, v[4], state[5]);
+        state[6] = mul_add_asm(head, v[5], state[6]);
+        state[7] = mul_add_asm(head, v[6], state[7]);
+        state[8] = mul_add_asm(head, v[7], state[8]);
+        state[9] = mul_add_asm(head, v[8], state[9]);
+        state[10] = mul_add_asm(head, v[9], state[10]);
+        state[11] = mul_add_asm(head, v[10], state[11]);
+        state[0] = dot;
+    }
+}
+
+/// Dual-lane sparse matrix-vector multiply for a width-8 state.
+///
+/// Runs the width-8 sparse layer on two independent states that share one
+/// `first_row` and one `v`. Lane-0 and lane-1 operations alternate statement
+/// by statement; only `v[0..=6]` is read.
+///
+/// # Safety
+///
+/// Same requirements as the `mul_asm` / `mul_add_asm` primitives it calls.
+#[inline(always)]
+pub unsafe fn cheap_matmul_dual_asm_w8(
+    s0: &mut [u64; 8],
+    s1: &mut [u64; 8],
+    first_row: &[u64; 8],
+    v: &[u64; 8],
+) {
+    unsafe {
+        // Incoming first element of each lane, needed by the tail updates.
+        let head0 = s0[0];
+        let head1 = s1[0];
+
+        // Per-lane dot products with `first_row`, one term per lane at a time.
+        let mut dot0 = mul_asm(head0, first_row[0]);
+        let mut dot1 = mul_asm(head1, first_row[0]);
+        dot0 = mul_add_asm(s0[1], first_row[1], dot0);
+        dot1 = mul_add_asm(s1[1], first_row[1], dot1);
+        dot0 = mul_add_asm(s0[2], first_row[2], dot0);
+        dot1 = mul_add_asm(s1[2], first_row[2], dot1);
+        dot0 = mul_add_asm(s0[3], first_row[3], dot0);
+        dot1 = mul_add_asm(s1[3], first_row[3], dot1);
+        dot0 = mul_add_asm(s0[4], first_row[4], dot0);
+        dot1 = mul_add_asm(s1[4], first_row[4], dot1);
+        dot0 = mul_add_asm(s0[5], first_row[5], dot0);
+        dot1 = mul_add_asm(s1[5], first_row[5], dot1);
+        dot0 = mul_add_asm(s0[6], first_row[6], dot0);
+        dot1 = mul_add_asm(s1[6], first_row[6], dot1);
+        dot0 = mul_add_asm(s0[7], first_row[7], dot0);
+        dot1 = mul_add_asm(s1[7], first_row[7], dot1);
+
+        // new[i] = s[i] + head * v[i-1] in both lanes; first slots are stored last.
+        s0[1] = mul_add_asm(head0, v[0], s0[1]);
+        s1[1] = mul_add_asm(head1, v[0], s1[1]);
+        s0[2] = mul_add_asm(head0, v[1], s0[2]);
+        s1[2] = mul_add_asm(head1, v[1], s1[2]);
+        s0[3] = mul_add_asm(head0, v[2], s0[3]);
+        s1[3] = mul_add_asm(head1, v[2], s1[3]);
+        s0[4] = mul_add_asm(head0, v[3], s0[4]);
+        s1[4] = mul_add_asm(head1, v[3], s1[4]);
+        s0[5] = mul_add_asm(head0, v[4], s0[5]);
+        s1[5] = mul_add_asm(head1, v[4], s1[5]);
+        s0[6] = mul_add_asm(head0, v[5], s0[6]);
+        s1[6] = mul_add_asm(head1, v[5], s1[6]);
+        s0[7] = mul_add_asm(head0, v[6], s0[7]);
+        s1[7] = mul_add_asm(head1, v[6], s1[7]);
+        s0[0] = dot0;
+        s1[0] = dot1;
+    }
+}
+
+/// Dual-lane sparse matrix-vector multiply for a width-12 state.
+///
+/// Width-12 counterpart of the dual width-8 routine. Loops over indices 1..12
+/// replace the unrolled statements to limit code size; `v[11]` is never read.
+///
+/// # Safety
+///
+/// Same requirements as the `mul_asm` / `mul_add_asm` primitives it calls.
+#[inline(always)]
+pub unsafe fn cheap_matmul_dual_asm_w12(
+    s0: &mut [u64; 12],
+    s1: &mut [u64; 12],
+    first_row: &[u64; 12],
+    v: &[u64; 12],
+) {
+    unsafe {
+        let head0 = s0[0];
+        let head1 = s1[0];
+
+        // Per-lane dot products with `first_row`, alternating lanes per term.
+        let mut dot0 = mul_asm(head0, first_row[0]);
+        let mut dot1 = mul_asm(head1, first_row[0]);
+        for (i, &coeff) in first_row.iter().enumerate().skip(1) {
+            dot0 = mul_add_asm(s0[i], coeff, dot0);
+            dot1 = mul_add_asm(s1[i], coeff, dot1);
+        }
+
+        // Element i + 1 gains head * v[i]; the first slots are stored last.
+        for (i, &weight) in v.iter().enumerate().take(11) {
+            s0[i + 1] = mul_add_asm(head0, weight, s0[i + 1]);
+            s1[i + 1] = mul_add_asm(head1, weight, s1[i + 1]);
+        }
+        s0[0] = dot0;
+        s1[0] = dot1;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Dense matrix-vector multiply (full-round linear layer)
+// ---------------------------------------------------------------------------
+
+/// Dense matrix-vector multiply for a width-8 state.
+///
+/// Overwrites `state` with `M * state`, where `m[i]` is row `i` of the 8x8
+/// matrix: output `i` is the dot product of `m[i]` with a copy of the input
+/// taken before any element is written. Used for the full-round linear layer.
+///
+/// NOTE(review): this is a safe `fn` whose body calls the `mul_asm` /
+/// `mul_add_asm` primitives inside `unsafe`; confirm they are sound for
+/// arbitrary `u64` inputs.
+pub fn dense_matmul_asm_w8(state: &mut [u64; 8], m: &[[u64; 8]; 8]) {
+    unsafe {
+        // Copy the input so every row sees the original values.
+        let snapshot = *state;
+
+        // Row by row: seed with the first product, then fold in the rest.
+        for (out, row) in state.iter_mut().zip(m.iter()) {
+            let mut dot = mul_asm(snapshot[0], row[0]);
+            for (&x, &coeff) in snapshot.iter().zip(row.iter()).skip(1) {
+                dot = mul_add_asm(x, coeff, dot);
+            }
+            *out = dot;
+        }
+    }
+}
+
+/// Dense matrix-vector multiply for a width-12 state.
+///
+/// Overwrites `state` with `M * state` for a 12x12 matrix given row by row in
+/// `m`; every dot product reads a copy of the input made before any write.
+pub fn dense_matmul_asm_w12(state: &mut [u64; 12], m: &[[u64; 12]; 12]) {
+    unsafe {
+        let snapshot = *state;
+
+        // Row by row: seed with the first product, then fold in the rest.
+        for (out, row) in state.iter_mut().zip(m.iter()) {
+            let mut dot = mul_asm(snapshot[0], row[0]);
+            for (&x, &coeff) in snapshot.iter().zip(row.iter()).skip(1) {
+                dot = mul_add_asm(x, coeff, dot);
+            }
+            *out = dot;
+        }
+    }
+}
+
+/// Dual-lane dense matrix-vector multiply for a width-8 state.
+///
+/// Multiplies two independent state vectors by the same 8x8 matrix `m`,
+/// overwriting each with its own product. Per row, the lane-0 and lane-1
+/// accumulations alternate; both inputs are copied before any write.
+/// Neither lane reads the other lane's data.
+pub fn dense_matmul_dual_asm_w8(s0: &mut [u64; 8], s1: &mut [u64; 8], m: &[[u64; 8]; 8]) {
+    unsafe {
+        // Copies of both inputs, so every row sees the original values.
+        let snap0 = *s0;
+        let snap1 = *s1;
+
+        // One matrix row at a time, advancing both lanes' dot products together.
+        for (i, row) in m.iter().enumerate() {
+            let mut dot0 = mul_asm(snap0[0], row[0]);
+            let mut dot1 = mul_asm(snap1[0], row[0]);
+            for (j, &coeff) in row.iter().enumerate().skip(1) {
+                dot0 = mul_add_asm(snap0[j], coeff, dot0);
+                dot1 = mul_add_asm(snap1[j], coeff, dot1);
+            }
+            s0[i] = dot0;
+            s1[i] = dot1;
+        }
+    }
+}
+
+/// Dual-lane dense matrix-vector multiply for a width-12 state.
+///
+/// Two independent states are each overwritten with `M * state` for the shared 12x12 `m`.
+pub fn dense_matmul_dual_asm_w12(s0: &mut [u64; 12], s1: &mut [u64; 12], m: &[[u64; 12]; 12]) {
+    unsafe {
+        // Copies of both inputs, so every row sees the original values.
+        let snap0 = *s0;
+        let snap1 = *s1;
+
+        // One matrix row at a time, advancing both lanes' dot products together.
+        for (i, row) in m.iter().enumerate() {
+            let mut dot0 = mul_asm(snap0[0], row[0]);
+            let mut dot1 = mul_asm(snap1[0], row[0]);
+            for (j, &coeff) in row.iter().enumerate().skip(1) {
+                dot0 = mul_add_asm(snap0[j], coeff, dot0);
+                dot1 = mul_add_asm(snap1[j], coeff, dot1);
+            }
+            s0[i] = dot0;
+            s1[i] = dot1;
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Round-constant addition
+// ---------------------------------------------------------------------------
+
+/// Add one round constant to each state element: `state[i]` is combined with `rc[i]`.
+///
+/// Sums come from `add_asm`; `WIDTH` is generic so widths 8 and 12 share this code.
+///
+/// # Safety
+///
+/// Same requirements as the `add_asm` primitive it calls.
+#[inline(always)]
+pub unsafe fn add_rc_asm<const WIDTH: usize>(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) {
+    unsafe {
+        for (slot, &constant) in state.iter_mut().zip(rc.iter()) {
+            *slot = add_asm(*slot, constant);
+        }
+    }
+}
+
+/// Dual-lane round-constant addition: both states receive the same `rc`, element by element.
+///
+/// # Safety
+///
+/// Same requirements as the `add_asm` primitive it calls.
+#[inline(always)]
+pub unsafe fn add_rc_dual_asm<const WIDTH: usize>(
+    s0: &mut [u64; WIDTH],
+    s1: &mut [u64; WIDTH],
+    rc: &[u64; WIDTH],
+) {
+    unsafe {
+        for (i, &constant) in rc.iter().enumerate() {
+            s0[i] = add_asm(s0[i], constant);
+            s1[i] = add_asm(s1[i], constant);
+        }
+    }
+}
+
+/// Add `rc` to `state[0]` only; no other element is touched. Panics if `state` is empty.
+///
+/// # Safety
+/// Same requirements as the `add_asm` primitive it calls.
+#[inline(always)]
+pub unsafe fn add_scalar_s0_asm(state: &mut [u64], rc: u64) {
+    unsafe {
+        let first = state[0];
+        state[0] = add_asm(first, rc);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::PrimeField64;
+    use proptest::prelude::*;
+    use rand::SeedableRng;
+    use rand::rngs::SmallRng;
+
+    use super::*;
+    use crate::Goldilocks;
+
+    type F = Goldilocks;
+
+    /// Reduce a raw `u64` to its canonical Goldilocks representative.
+    ///
+    /// Wraps the value into a field element and extracts the unique
+    /// representative in `[0, P)`. This is the single source of truth
+    /// for comparing ASM outputs (which may carry unreduced values)
+    /// against field-level references.
+    fn canon(x: u64) -> u64 {
+        F::new(x).as_canonical_u64()
+    }
+
+    proptest! {
+        // ================================================================
+        // S-box: first element raised to the 7th power
+        // ================================================================
+
+        /// Verify the single-lane S-box against a field-level reference.
+        ///
+        /// The reference computes x^7 step by step using field multiplication.
+        /// Only the first element should change; the rest must be untouched.
+        #[test]
+        fn test_sbox_s0_asm(vals in prop::array::uniform8(any::<u64>())) {
+            // Build the expected x^7 using the field multiplication chain.
+            let x = F::new(vals[0]);
+            let x2 = x * x;
+            let x3 = x2 * x;
+            let x4 = x2 * x2;
+            let expected_s0 = (x3 * x4).as_canonical_u64();
+
+            // Run the ASM version on a copy.
+            let mut state = vals;
+            unsafe { sbox_s0_asm(&mut state); }
+
+            // The first element must match x^7.
+            prop_assert_eq!(canon(state[0]), expected_s0);
+
+            // Every other element must be unchanged.
+            for i in 1..8 {
+                prop_assert_eq!(state[i], vals[i]);
+            }
+        }
+
+        /// Verify the dual-lane S-box matches two independent single-lane calls.
+        ///
+        /// Runs the single-lane version on each lane separately as the
+        /// reference, then checks the dual-lane version produces the same.
+        #[test]
+        fn test_sbox_s0_dual_asm(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Build the reference by running single-lane on each lane.
+            let mut ref0 = vals0;
+            let mut ref1 = vals1;
+            unsafe {
+                sbox_s0_asm(&mut ref0);
+                sbox_s0_asm(&mut ref1);
+            }
+
+            // Run the dual-lane version.
+            let mut s0 = vals0;
+            let mut s1 = vals1;
+            unsafe { sbox_s0_dual_asm(&mut s0, &mut s1); }
+
+            // Both first elements must match their reference.
+            prop_assert_eq!(canon(s0[0]), canon(ref0[0]));
+            prop_assert_eq!(canon(s1[0]), canon(ref1[0]));
+
+            // All other elements must be unchanged.
+            for i in 1..8 {
+                prop_assert_eq!(s0[i], vals0[i]);
+                prop_assert_eq!(s1[i], vals1[i]);
+            }
+        }
+
+        // ================================================================
+        // Round-constant addition: element-wise field addition
+        // ================================================================
+
+        /// Verify round-constant addition (width 8) against field addition.
+        ///
+        /// Each element should equal the field sum of the original value
+        /// and its corresponding round constant.
+        #[test]
+        fn test_add_rc_asm_w8(
+            vals in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Build the expected result using field addition.
+            let expected: [u64; 8] = core::array::from_fn(|i| {
+                (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64()
+            });
+
+            // Run the ASM version.
+            let mut state = vals;
+            unsafe { add_rc_asm(&mut state, &rc); }
+
+            // Every element must match.
+            for i in 0..8 {
+                prop_assert_eq!(canon(state[i]), expected[i]);
+            }
+        }
+
+        /// Same verification for width 12.
+        #[test]
+        fn test_add_rc_asm_w12(
+            vals in prop::array::uniform12(any::<u64>()),
+            rc in prop::array::uniform12(any::<u64>()),
+        ) {
+            let expected: [u64; 12] = core::array::from_fn(|i| {
+                (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64()
+            });
+
+            let mut state = vals;
+            unsafe { add_rc_asm(&mut state, &rc); }
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(state[i]), expected[i]);
+            }
+        }
+
+        /// Verify dual-lane round-constant addition (width 8) matches
+        /// two independent single-lane calls.
+        #[test]
+        fn test_add_rc_dual_asm_w8(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Reference: single-lane on each independently.
+            let mut ref0 = vals0;
+            let mut ref1 = vals1;
+            unsafe {
+                add_rc_asm(&mut ref0, &rc);
+                add_rc_asm(&mut ref1, &rc);
+            }
+
+            // Run the dual-lane version.
+            let mut s0 = vals0;
+            let mut s1 = vals1;
+            unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); }
+
+            // Both lanes must match their references.
+            for i in 0..8 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
+            }
+        }
+
+        /// Same dual-lane verification for width 12.
+        #[test]
+        fn test_add_rc_dual_asm_w12(
+            vals0 in prop::array::uniform12(any::<u64>()),
+            vals1 in prop::array::uniform12(any::<u64>()),
+            rc in prop::array::uniform12(any::<u64>()),
+        ) {
+            let mut ref0 = vals0;
+            let mut ref1 = vals1;
+            unsafe {
+                add_rc_asm(&mut ref0, &rc);
+                add_rc_asm(&mut ref1, &rc);
+            }
+
+            let mut s0 = vals0;
+            let mut s1 = vals1;
+            unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); }
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
+            }
+        }
+
+        // ================================================================
+        // Scalar addition: first element only
+        // ================================================================
+
+        /// Verify that adding a scalar to the first element matches
+        /// field addition, and that all other elements are untouched.
+        #[test]
+        fn test_add_scalar_s0_asm(vals in prop::array::uniform8(any::<u64>()), rc: u64) {
+            // Expected: field sum of the first element and the constant.
+            let expected_s0 = (F::new(vals[0]) + F::new(rc)).as_canonical_u64();
+
+            // Run the ASM version.
+            let mut state = vals;
+            unsafe { add_scalar_s0_asm(&mut state, rc); }
+
+            // The first element must match.
+            prop_assert_eq!(canon(state[0]), expected_s0);
+
+            // Every other element must be unchanged.
+            for i in 1..8 {
+                prop_assert_eq!(state[i], vals[i]);
+            }
+        }
+
+        // ================================================================
+        // Sparse matrix-vector multiply (partial-round linear layer)
+        //
+        // The sparse matrix decomposes into:
+        //   new[0] = dot(first_row, state)
+        //   new[i] = state[i] + state[0] * v[i-1]   for i >= 1
+        // ================================================================
+
+        /// Check the width-8 sparse matmul against a reference built from field ops.
+        ///
+        /// The reference puts the dot product with `first_row` in slot 0 and
+        /// applies `state[i] + state[0] * v[i - 1]` to every other slot.
+        #[test]
+        fn test_cheap_matmul_asm_w8(
+            vals in prop::array::uniform8(any::<u64>()),
+            first_row in prop::array::uniform8(any::<u64>()),
+            v in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Field-element views of the three raw inputs.
+            let state_f: [F; 8] = vals.map(F::new);
+            let row_f: [F; 8] = first_row.map(F::new);
+            let v_f: [F; 8] = v.map(F::new);
+
+            // Every tail slot is updated with the *original* slot 0.
+            let head = state_f[0];
+
+            // Slot 0 becomes the dot product of the state with the first row.
+            let dot: F = state_f.iter().zip(row_f.iter()).map(|(&s, &r)| s * r).sum();
+
+            // Slots 1..8 add the old head scaled by `v[i - 1]`; slot 0 gets the dot product.
+            let mut expected = state_f;
+            for (i, slot) in expected.iter_mut().enumerate().skip(1) {
+                *slot = state_f[i] + head * v_f[i - 1];
+            }
+            expected[0] = dot;
+
+            // Execute the assembly routine on a copy of the raw inputs.
+            let mut state = vals;
+            unsafe { cheap_matmul_asm_w8(&mut state, &first_row, &v); }
+
+            // Compare slot by slot via `canon` / `as_canonical_u64`.
+            for (got, want) in state.iter().zip(expected.iter()) {
+                prop_assert_eq!(canon(*got), want.as_canonical_u64());
+            }
+        }
+
+        /// Verify the width-12 sparse matmul against a field-level reference.
+        #[test]
+        fn test_cheap_matmul_asm_w12(
+            vals in prop::array::uniform12(any::<u64>()),
+            first_row in prop::array::uniform12(any::<u64>()),
+            v in prop::array::uniform12(any::<u64>()), // the reference reads only v[0..11]
+        ) {
+            let f: [F; 12] = vals.map(F::new);
+            let fr: [F; 12] = first_row.map(F::new);
+            let fv: [F; 12] = v.map(F::new);
+
+            let old_s0 = f[0]; // every tail update uses the original first element
+            let new_s0: F = (0..12).map(|i| f[i] * fr[i]).sum(); // dot(first_row, state)
+
+            let mut expected = f;
+            for i in 1..12 {
+                expected[i] = f[i] + old_s0 * fv[i - 1]; // state[i] + state[0] * v[i - 1]
+            }
+            expected[0] = new_s0;
+
+            let mut state = vals;
+            unsafe { cheap_matmul_asm_w12(&mut state, &first_row, &v); } // ASM under test
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
+            }
+        }
+
+        /// Check that the width-8 dual-lane sparse matmul agrees with the
+        /// single-lane routine applied to each lane on its own.
+        #[test]
+        fn test_cheap_matmul_dual_asm_w8(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+            first_row in prop::array::uniform8(any::<u64>()),
+            v in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Expected values: one single-lane call per lane.
+            // Both calls share the same `first_row` and `v`.
+            let (mut ref0, mut ref1) = (vals0, vals1);
+            unsafe {
+                cheap_matmul_asm_w8(&mut ref0, &first_row, &v);
+                cheap_matmul_asm_w8(&mut ref1, &first_row, &v);
+            }
+
+            // Values under test: a single dual-lane call covering both lanes,
+            // starting from the same untouched inputs.
+            let (mut s0, mut s1) = (vals0, vals1);
+            unsafe { cheap_matmul_dual_asm_w8(&mut s0, &mut s1, &first_row, &v); }
+
+            // Lane by lane, slot by slot, the results must agree.
+            for idx in 0..8 {
+                prop_assert_eq!(canon(s0[idx]), canon(ref0[idx]));
+                prop_assert_eq!(canon(s1[idx]), canon(ref1[idx]));
+            }
+        }
+
+        /// Verify the width-12 dual-lane sparse matmul matches two independent single-lane calls.
+        #[test]
+        fn test_cheap_matmul_dual_asm_w12(
+            vals0 in prop::array::uniform12(any::<u64>()),
+            vals1 in prop::array::uniform12(any::<u64>()),
+            first_row in prop::array::uniform12(any::<u64>()),
+            v in prop::array::uniform12(any::<u64>()),
+        ) {
+            let mut ref0 = vals0; // reference copies, one per lane
+            let mut ref1 = vals1;
+            unsafe {
+                cheap_matmul_asm_w12(&mut ref0, &first_row, &v);
+                cheap_matmul_asm_w12(&mut ref1, &first_row, &v);
+            }
+
+            let mut s0 = vals0; // fresh copies for the dual-lane routine under test
+            let mut s1 = vals1;
+            unsafe { cheap_matmul_dual_asm_w12(&mut s0, &mut s1, &first_row, &v); }
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i])); // lane 0 vs its reference
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i])); // lane 1 vs its reference
+            }
+        }
+
+        // ================================================================
+        // Dense matrix-vector multiply (full-round linear layer)
+        // ================================================================
+
+        /// Check the width-8 dense matmul against a reference built from field ops.
+        ///
+        /// The reference computes each output as the dot product of one matrix
+        /// row with the input vector; the matrix comes from a fixed RNG seed.
+        #[test]
+        fn test_dense_matmul_asm_w8(vals in prop::array::uniform8(any::<u64>())) {
+            // Seeded RNG, so every proptest case multiplies by the same matrix.
+            let mut rng = SmallRng::seed_from_u64(42);
+            let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng);
+
+            // Expected output: one row-by-vector dot product per slot, in field arithmetic.
+            let input_f: [F; 8] = vals.map(F::new);
+            let expected: [F; 8] = m.map(|row| {
+                row.iter().zip(input_f.iter()).map(|(&coeff, &x)| x * F::new(coeff)).sum()
+            });
+
+            // Execute the assembly routine on a copy of the raw input.
+            let mut state = vals;
+            dense_matmul_asm_w8(&mut state, &m);
+
+            // Compare slot by slot via `canon` / `as_canonical_u64`.
+            for (got, want) in state.iter().zip(expected.iter()) {
+                prop_assert_eq!(canon(*got), want.as_canonical_u64());
+            }
+        }
+
+        /// Verify the width-12 dense matmul against a field-level matrix-vector product.
+        #[test]
+        fn test_dense_matmul_asm_w12(vals in prop::array::uniform12(any::<u64>())) {
+            let mut rng = SmallRng::seed_from_u64(43); // fixed seed: same matrix every case
+            let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng);
+
+            let f: [F; 12] = vals.map(F::new);
+            let expected: [F; 12] = core::array::from_fn(|i| {
+                (0..12).map(|j| f[j] * F::new(m[i][j])).sum() // dot(row i of m, input)
+            });
+
+            let mut state = vals;
+            dense_matmul_asm_w12(&mut state, &m); // ASM under test
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
+            }
+        }
+
+        /// Check that the width-8 dual-lane dense matmul agrees with the
+        /// single-lane routine applied to each lane on its own.
+        #[test]
+        fn test_dense_matmul_dual_asm_w8(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Seeded RNG, so every proptest case multiplies by the same matrix.
+            let mut rng = SmallRng::seed_from_u64(44);
+            let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng);
+
+            // Expected values: one single-lane call per lane.
+            // Both calls share the matrix `m`.
+            let (mut ref0, mut ref1) = (vals0, vals1);
+            dense_matmul_asm_w8(&mut ref0, &m);
+            dense_matmul_asm_w8(&mut ref1, &m);
+
+            // Values under test: a single dual-lane call covering both lanes,
+            // starting from the same untouched inputs.
+            let (mut s0, mut s1) = (vals0, vals1);
+            dense_matmul_dual_asm_w8(&mut s0, &mut s1, &m);
+
+            // Lane by lane, slot by slot, the results must agree.
+            for idx in 0..8 {
+                prop_assert_eq!(canon(s0[idx]), canon(ref0[idx]));
+                prop_assert_eq!(canon(s1[idx]), canon(ref1[idx]));
+            }
+        }
+
+        /// Verify the width-12 dual-lane dense matmul matches two independent single-lane calls.
+        #[test]
+        fn test_dense_matmul_dual_asm_w12(
+            vals0 in prop::array::uniform12(any::<u64>()),
+            vals1 in prop::array::uniform12(any::<u64>()),
+        ) {
+            let mut rng = SmallRng::seed_from_u64(45); // fixed seed: same matrix every case
+            let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng);
+
+            let mut ref0 = vals0; // reference copies, one per lane
+            let mut ref1 = vals1;
+            dense_matmul_asm_w12(&mut ref0, &m);
+            dense_matmul_asm_w12(&mut ref1, &m);
+
+            let mut s0 = vals0; // fresh copies for the dual-lane routine under test
+            let mut s1 = vals1;
+            dense_matmul_dual_asm_w12(&mut s0, &mut s1, &m);
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i])); // lane 0 vs its reference
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i])); // lane 1 vs its reference
+            }
+        }
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs
new file mode 100644
index 000000000..cf74b4df8
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs
@@ -0,0 +1,652 @@
+//! Optimized Poseidon2 for Goldilocks on aarch64.
+//!
+//! Uses ARM inline assembly with latency hiding via interleaved S-box/MDS computation.
+//! Fully unrolled internal rounds for W8, W12, W16.
+//!
+//! For packed operations, lanes are extracted to scalar, processed with interleaved
+//! dual-lane ASM, then repacked. This is faster than using PackedGoldilocksNeon
+//! arithmetic directly because the scalar `add_asm` avoids the modular reduction
+//! overhead present in NEON addition.
+
+use alloc::vec::Vec;
+
+use p3_poseidon2::{
+    ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, InternalLayer,
+    InternalLayerConstructor, poseidon2_round_numbers_128,
+};
+use p3_symmetric::{CryptographicPermutation, Permutation};
+use rand::distr::{Distribution, StandardUniform};
+use rand::{Rng, RngExt};
+
+use super::packing::PackedGoldilocksNeon;
+use super::poseidon2_asm::*;
+use super::utils::{pack_lanes, unpack_lanes};
+use crate::{Goldilocks, MATRIX_DIAG_20_GOLDILOCKS};
+
+/// Degree of the chosen permutation polynomial for Goldilocks.
+const GOLDILOCKS_S_BOX_DEGREE: u64 = 7;
+
+/// ASM-optimized internal layer with split-state s0-in-register, pre-converted constants.
+#[derive(Debug, Default, Clone)]
+pub struct Poseidon2InternalLayerGoldilocksAsm {
+    constants_raw: Vec<u64>, // raw `value` word of each internal constant, in input order
+}
+
+impl InternalLayerConstructor<Goldilocks> for Poseidon2InternalLayerGoldilocksAsm {
+    fn new_from_constants(internal_constants: Vec<Goldilocks>) -> Self {
+        // Keep only the raw `value` word of each constant, in the order given.
+        Self { constants_raw: internal_constants.into_iter().map(|rc| rc.value).collect() }
+    }
+}
+
+const DIAG_RAW_20: [u64; 20] = {
+    // Raw `value` words of `MATRIX_DIAG_20_GOLDILOCKS`, copied at compile time with `while`.
+    let (mut raw, mut idx) = ([0u64; 20], 0);
+    while idx < 20 {
+        raw[idx] = MATRIX_DIAG_20_GOLDILOCKS[idx].value;
+        idx += 1;
+    }
+    raw
+};
+
+impl InternalLayer<Goldilocks, 8, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocksAsm {
+    fn permute_state(&self, state: &mut [Goldilocks; 8]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 8]).cast::<[u64; 8]>() };
+        internal_permute_state_asm_w8(words, &self.constants_raw);
+    }
+}
+
+impl InternalLayer<Goldilocks, 12, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [Goldilocks; 12]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 12]).cast::<[u64; 12]>() };
+        internal_permute_state_asm_w12(words, &self.constants_raw);
+    }
+}
+
+impl InternalLayer<Goldilocks, 16, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [Goldilocks; 16]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 16]).cast::<[u64; 16]>() };
+        internal_permute_state_asm_w16(words, &self.constants_raw);
+    }
+}
+
+impl InternalLayer<Goldilocks, 20, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [Goldilocks; 20]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 20]).cast::<[u64; 20]>() };
+        internal_permute_state_asm(words, &DIAG_RAW_20, &self.constants_raw);
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Poseidon2ExternalLayerGoldilocksAsm<const WIDTH: usize> {
+    initial_constants_raw: Vec<[u64; WIDTH]>, // raw words, one array per initial constant set
+    terminal_constants_raw: Vec<[u64; WIDTH]>, // raw words, one array per terminal constant set
+}
+
+impl<const WIDTH: usize> ExternalLayerConstructor<Goldilocks, WIDTH>
+    for Poseidon2ExternalLayerGoldilocksAsm<WIDTH>
+{
+    /// Build the layer by copying the raw `value` word out of every external constant.
+    ///
+    /// The initial and terminal constant lists are converted independently and
+    /// keep their original order.
+    fn new_from_constants(external_constants: ExternalLayerConstants<Goldilocks, WIDTH>) -> Self {
+        // Converts one `[Goldilocks; WIDTH]` entry into its raw words.
+        let to_raw = |rc: &[Goldilocks; WIDTH]| -> [u64; WIDTH] {
+            core::array::from_fn(|idx| rc[idx].value)
+        };
+        let initial = external_constants.get_initial_constants();
+        let terminal = external_constants.get_terminal_constants();
+        Self {
+            initial_constants_raw: initial.iter().map(to_raw).collect(),
+            terminal_constants_raw: terminal.iter().map(to_raw).collect(),
+        }
+    }
+}
+
+impl ExternalLayer<Goldilocks, 8, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<8>
+{
+    fn permute_state_initial(&self, state: &mut [Goldilocks; 8]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 8]).cast::<[u64; 8]>() };
+        external_initial_permute_w8(words, &self.initial_constants_raw);
+    }
+
+    fn permute_state_terminal(&self, state: &mut [Goldilocks; 8]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 8]).cast::<[u64; 8]>() };
+        external_terminal_permute_w8(words, &self.terminal_constants_raw);
+    }
+}
+
+impl ExternalLayer<Goldilocks, 12, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<12>
+{
+    fn permute_state_initial(&self, state: &mut [Goldilocks; 12]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 12]).cast::<[u64; 12]>() };
+        external_initial_permute_state_asm(words, &self.initial_constants_raw);
+    }
+
+    fn permute_state_terminal(&self, state: &mut [Goldilocks; 12]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 12]).cast::<[u64; 12]>() };
+        external_terminal_permute_state_asm(words, &self.terminal_constants_raw);
+    }
+}
+
+impl ExternalLayer<Goldilocks, 16, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<16>
+{
+    fn permute_state_initial(&self, state: &mut [Goldilocks; 16]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 16]).cast::<[u64; 16]>() };
+        external_initial_permute_state_asm(words, &self.initial_constants_raw);
+    }
+
+    fn permute_state_terminal(&self, state: &mut [Goldilocks; 16]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 16]).cast::<[u64; 16]>() };
+        external_terminal_permute_state_asm(words, &self.terminal_constants_raw);
+    }
+}
+
+impl ExternalLayer<Goldilocks, 20, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<20>
+{
+    fn permute_state_initial(&self, state: &mut [Goldilocks; 20]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 20]).cast::<[u64; 20]>() };
+        external_initial_permute_state_asm(words, &self.initial_constants_raw);
+    }
+
+    fn permute_state_terminal(&self, state: &mut [Goldilocks; 20]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 20]).cast::<[u64; 20]>() };
+        external_terminal_permute_state_asm(words, &self.terminal_constants_raw);
+    }
+}
+
+/// Poseidon2 over `Goldilocks` that plugs in the ASM-backed external and internal layers.
+pub type Poseidon2GoldilocksAsm<const WIDTH: usize> = p3_poseidon2::Poseidon2<
+    Goldilocks,
+    Poseidon2ExternalLayerGoldilocksAsm<WIDTH>,
+    Poseidon2InternalLayerGoldilocksAsm,
+    WIDTH,
+    GOLDILOCKS_S_BOX_DEGREE,
+>;
+
+impl InternalLayer<PackedGoldilocksNeon, 8, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 8]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        internal_permute_split_dual_w8(&mut lane_a, &mut lane_b, &self.constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl InternalLayer<PackedGoldilocksNeon, 12, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 12]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        internal_permute_split_dual_w12(&mut lane_a, &mut lane_b, &self.constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl InternalLayer<PackedGoldilocksNeon, 16, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 16]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        internal_permute_split_dual_w16(&mut lane_a, &mut lane_b, &self.constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl InternalLayer<PackedGoldilocksNeon, 20, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2InternalLayerGoldilocksAsm
+{
+    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 20]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        internal_permute_split_dual(&mut lane_a, &mut lane_b, &DIAG_RAW_20, &self.constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl ExternalLayer<PackedGoldilocksNeon, 8, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<8>
+{
+    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 8]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_initial_permute_dual_w8(&mut lane_a, &mut lane_b, &self.initial_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+
+    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 8]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_terminal_permute_dual_w8(&mut lane_a, &mut lane_b, &self.terminal_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl ExternalLayer<PackedGoldilocksNeon, 12, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<12>
+{
+    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 12]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_initial_permute_dual(&mut lane_a, &mut lane_b, &self.initial_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+
+    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 12]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_terminal_permute_dual(&mut lane_a, &mut lane_b, &self.terminal_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl ExternalLayer<PackedGoldilocksNeon, 16, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<16>
+{
+    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 16]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_initial_permute_dual(&mut lane_a, &mut lane_b, &self.initial_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+
+    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 16]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_terminal_permute_dual(&mut lane_a, &mut lane_b, &self.terminal_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+impl ExternalLayer<PackedGoldilocksNeon, 20, GOLDILOCKS_S_BOX_DEGREE>
+    for Poseidon2ExternalLayerGoldilocksAsm<20>
+{
+    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 20]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_initial_permute_dual(&mut lane_a, &mut lane_b, &self.initial_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+
+    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 20]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // one scalar state per packed lane
+        external_terminal_permute_dual(&mut lane_a, &mut lane_b, &self.terminal_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // store both lanes back into `state`
+    }
+}
+
+/// Fused Poseidon2 permutation for Goldilocks.
+///
+/// Instead of unpacking/packing between each of the 3 phases (initial external,
+/// internal, terminal external), this performs a single unpack at the start and
+/// a single pack at the end, eliminating the redundant lane conversions per
+/// packed permutation.
+#[derive(Clone, Debug)]
+pub struct Poseidon2GoldilocksFused<const WIDTH: usize> {
+    internal_constants_raw: Vec<u64>, // raw `value` word of each internal constant
+    initial_constants_raw: Vec<[u64; WIDTH]>, // raw words of the initial external constants
+    terminal_constants_raw: Vec<[u64; WIDTH]>, // raw words of the terminal external constants
+}
+
+impl<const WIDTH: usize> Poseidon2GoldilocksFused<WIDTH> {
+    /// Build the permutation from explicit constants, storing their raw `value` words.
+    ///
+    /// Every constant list keeps the order it was given in.
+    pub fn new(
+        external_constants: &ExternalLayerConstants<Goldilocks, WIDTH>,
+        internal_constants: &[Goldilocks],
+    ) -> Self {
+        // Converts one `[Goldilocks; WIDTH]` entry into its raw words.
+        let to_raw = |rc: &[Goldilocks; WIDTH]| -> [u64; WIDTH] {
+            core::array::from_fn(|idx| rc[idx].value)
+        };
+        let initial = external_constants.get_initial_constants();
+        let terminal = external_constants.get_terminal_constants();
+        Self {
+            internal_constants_raw: internal_constants.iter().map(|c| c.value).collect(),
+            initial_constants_raw: initial.iter().map(to_raw).collect(),
+            terminal_constants_raw: terminal.iter().map(to_raw).collect(),
+        }
+    }
+
+    /// Draw fresh constants from `rng`: the external constants first (using
+    /// `rounds_f`), then `rounds_p` internal constants.
+    pub fn new_from_rng<R: Rng>(rounds_f: usize, rounds_p: usize, rng: &mut R) -> Self
+    where
+        StandardUniform: Distribution<Goldilocks> + Distribution<[Goldilocks; WIDTH]>,
+    {
+        let external_constants = ExternalLayerConstants::new_from_rng(rounds_f, rng);
+        let internal_constants: Vec<Goldilocks> =
+            rng.sample_iter(StandardUniform).take(rounds_p).collect();
+        Self::new(&external_constants, &internal_constants)
+    }
+
+    /// Like `new_from_rng`, with the round counts taken from
+    /// `poseidon2_round_numbers_128`; panics with its error if that lookup fails.
+    pub fn new_from_rng_128<R: Rng>(rng: &mut R) -> Self
+    where
+        StandardUniform: Distribution<Goldilocks> + Distribution<[Goldilocks; WIDTH]>,
+    {
+        let (rounds_f, rounds_p) =
+            poseidon2_round_numbers_128::<Goldilocks>(WIDTH, GOLDILOCKS_S_BOX_DEGREE)
+                .unwrap_or_else(|e| panic!("{e}"));
+        Self::new_from_rng(rounds_f, rounds_p, rng)
+    }
+}
+
+impl Permutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 8]).cast::<[u64; 8]>() };
+        external_initial_permute_w8(words, &self.initial_constants_raw);
+        internal_permute_state_asm_w8(words, &self.internal_constants_raw);
+        external_terminal_permute_w8(words, &self.terminal_constants_raw);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> {}
+
+impl Permutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 12]).cast::<[u64; 12]>() };
+        external_initial_permute_state_asm(words, &self.initial_constants_raw);
+        internal_permute_state_asm_w12(words, &self.internal_constants_raw);
+        external_terminal_permute_state_asm(words, &self.terminal_constants_raw);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> {}
+
+impl Permutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 16]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 16]).cast::<[u64; 16]>() };
+        external_initial_permute_state_asm(words, &self.initial_constants_raw);
+        internal_permute_state_asm_w16(words, &self.internal_constants_raw);
+        external_terminal_permute_state_asm(words, &self.terminal_constants_raw);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> {}
+
+impl Permutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> {
+    fn permute_mut(&self, state: &mut [Goldilocks; 20]) {
+        // SAFETY(review): relies on `Goldilocks` having the same layout as `u64`; confirm.
+        let words = unsafe { &mut *(state as *mut [Goldilocks; 20]).cast::<[u64; 20]>() };
+        external_initial_permute_state_asm(words, &self.initial_constants_raw);
+        internal_permute_state_asm(words, &DIAG_RAW_20, &self.internal_constants_raw);
+        external_terminal_permute_state_asm(words, &self.terminal_constants_raw);
+    }
+}
+
+impl CryptographicPermutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> {}
+
+impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // single unpack for all phases
+        external_initial_permute_dual_w8(&mut lane_a, &mut lane_b, &self.initial_constants_raw);
+        internal_permute_split_dual_w8(&mut lane_a, &mut lane_b, &self.internal_constants_raw);
+        external_terminal_permute_dual_w8(&mut lane_a, &mut lane_b, &self.terminal_constants_raw);
+        pack_lanes(state, &lane_a, &lane_b); // single repack at the end
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> {}
+
+impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // single unpack for all phases
+        let mut neon_state = lanes_to_neon(&lane_a, &lane_b); // input to the `*_neon` phases
+        external_initial_neon(&mut neon_state, &self.initial_constants_raw);
+        internal_permute_neon_w12(&mut neon_state, &self.internal_constants_raw);
+        external_terminal_neon(&mut neon_state, &self.terminal_constants_raw);
+        neon_to_lanes(&neon_state, &mut lane_a, &mut lane_b); // back to per-lane form
+        pack_lanes(state, &lane_a, &lane_b); // single repack at the end
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> {}
+
+impl Permutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 16]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // single unpack for all phases
+        let mut neon_state = lanes_to_neon(&lane_a, &lane_b); // input to the `*_neon` phases
+        external_initial_neon(&mut neon_state, &self.initial_constants_raw);
+        internal_permute_neon_w16(&mut neon_state, &self.internal_constants_raw);
+        external_terminal_neon(&mut neon_state, &self.terminal_constants_raw);
+        neon_to_lanes(&neon_state, &mut lane_a, &mut lane_b); // back to per-lane form
+        pack_lanes(state, &lane_a, &lane_b); // single repack at the end
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> {}
+
+impl Permutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> {
+    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 20]) {
+        let (mut lane_a, mut lane_b) = unpack_lanes(state); // single unpack for all phases
+        let mut neon_state = lanes_to_neon(&lane_a, &lane_b); // input to the `*_neon` phases
+        external_initial_neon(&mut neon_state, &self.initial_constants_raw);
+        internal_permute_neon(&mut neon_state, &DIAG_RAW_20, &self.internal_constants_raw);
+        external_terminal_neon(&mut neon_state, &self.terminal_constants_raw);
+        neon_to_lanes(&neon_state, &mut lane_a, &mut lane_b); // back to per-lane form
+        pack_lanes(state, &lane_a, &lane_b); // single repack at the end
+    }
+}
+
+impl CryptographicPermutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> {}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::{PrimeCharacteristicRing, PrimeField64};
+    use p3_poseidon2::{ExternalLayerConstants, InternalLayer, Poseidon2};
+    use p3_symmetric::Permutation;
+    use rand::rngs::SmallRng;
+    use rand::{RngExt, SeedableRng};
+
+    use super::*;
+    use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE;
+    use crate::{
+        GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8,
+        Poseidon2ExternalLayerGoldilocks, Poseidon2InternalLayerGoldilocks,
+    };
+
+    type F = Goldilocks;
+
+    // Checks that the ASM Poseidon2 matches the generic scalar one for zero and random inputs.
+    fn test_asm_matches_generic<const WIDTH: usize>()
+    where
+        Poseidon2InternalLayerGoldilocks: InternalLayer<F, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
+        Poseidon2InternalLayerGoldilocksAsm: InternalLayer<F, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
+        Poseidon2ExternalLayerGoldilocksAsm<WIDTH>:
+            ExternalLayer<Goldilocks, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
+    {
+        let mut rng = SmallRng::seed_from_u64(42); // fixed seed: reproducible constants and inputs
+
+        let external_constants = ExternalLayerConstants::<Goldilocks, WIDTH>::new_from_rng(
+            2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS,
+            &mut rng,
+        );
+        let internal_constants: Vec<Goldilocks> = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8)
+            .map(|_| F::from_u64(rng.random())) // width-8 round count, used for every WIDTH
+            .collect();
+
+        // Reference: generic scalar layers, built from the same constants.
+        let generic_poseidon2: Poseidon2<
+            Goldilocks,
+            Poseidon2ExternalLayerGoldilocks<WIDTH>,
+            Poseidon2InternalLayerGoldilocks,
+            WIDTH,
+            GOLDILOCKS_S_BOX_DEGREE,
+        > = Poseidon2::new(external_constants.clone(), internal_constants.clone());
+
+        // Under test: ASM external and internal layers.
+        let asm_poseidon2: Poseidon2GoldilocksAsm<WIDTH> =
+            Poseidon2::new(external_constants, internal_constants);
+
+        // Case 1: all-zero input.
+        let mut generic_input = [F::ZERO; WIDTH];
+        let mut asm_input = [F::ZERO; WIDTH];
+
+        generic_poseidon2.permute_mut(&mut generic_input);
+        asm_poseidon2.permute_mut(&mut asm_input);
+
+        for i in 0..WIDTH {
+            assert_eq!(
+                asm_input[i].as_canonical_u64(),
+                generic_input[i].as_canonical_u64(),
+                "ASM mismatch at index {i} for zero input"
+            );
+        }
+
+        // Case 2: random input, identical for both implementations.
+        let mut generic_input: [F; WIDTH] = core::array::from_fn(|_| F::from_u64(rng.random()));
+        let mut asm_input = generic_input;
+
+        generic_poseidon2.permute_mut(&mut generic_input);
+        asm_poseidon2.permute_mut(&mut asm_input);
+
+        for i in 0..WIDTH {
+            assert_eq!(
+                asm_input[i].as_canonical_u64(),
+                generic_input[i].as_canonical_u64(),
+                "ASM mismatch at index {i} for random input"
+            );
+        }
+    }
+
+    fn test_fused_matches_generic<const WIDTH: usize>()
+    where
+        Poseidon2InternalLayerGoldilocks: InternalLayer<F, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
+        Poseidon2GoldilocksFused<WIDTH>:
+            Permutation<[F; WIDTH]> + Permutation<[PackedGoldilocksNeon; WIDTH]>,
+    {
+        let mut rng = SmallRng::seed_from_u64(42); // fixed seed: reproducible constants and inputs
+
+        let external_constants = ExternalLayerConstants::<Goldilocks, WIDTH>::new_from_rng(
+            2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS,
+            &mut rng,
+        );
+        let internal_constants: Vec<Goldilocks> = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8)
+            .map(|_| rng.random()) // width-8 round count, used for every WIDTH
+            .collect();
+
+        let generic_poseidon2: Poseidon2<
+            Goldilocks,
+            Poseidon2ExternalLayerGoldilocks<WIDTH>,
+            Poseidon2InternalLayerGoldilocks,
+            WIDTH,
+            GOLDILOCKS_S_BOX_DEGREE,
+        > = Poseidon2::new(external_constants.clone(), internal_constants.clone());
+
+        let fused =
+            Poseidon2GoldilocksFused::<WIDTH>::new(&external_constants, &internal_constants);
+
+        // Scalar: fused vs generic, first on the all-zero input.
+        let mut generic_input = [F::ZERO; WIDTH];
+        let mut fused_input = [F::ZERO; WIDTH];
+        generic_poseidon2.permute_mut(&mut generic_input);
+        fused.permute_mut(&mut fused_input);
+        for i in 0..WIDTH {
+            assert_eq!(
+                fused_input[i].as_canonical_u64(),
+                generic_input[i].as_canonical_u64(),
+                "Fused scalar mismatch at index {i} for zero input"
+            );
+        }
+
+        let mut generic_input: [F; WIDTH] = rng.random(); // then on a shared random input
+        let mut fused_input = generic_input;
+        generic_poseidon2.permute_mut(&mut generic_input);
+        fused.permute_mut(&mut fused_input);
+        for i in 0..WIDTH {
+            assert_eq!(
+                fused_input[i].as_canonical_u64(),
+                generic_input[i].as_canonical_u64(),
+                "Fused scalar mismatch at index {i} for random input"
+            );
+        }
+
+        // Packed: each lane of the fused packed result must match the fused scalar result.
+        let scalar_a: [F; WIDTH] = rng.random();
+        let scalar_b: [F; WIDTH] = rng.random();
+
+        let mut packed_input: [PackedGoldilocksNeon; WIDTH] =
+            core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]]));
+        fused.permute_mut(&mut packed_input);
+
+        let mut expected_a = scalar_a; // scalar reference for packed lane 0
+        let mut expected_b = scalar_b; // scalar reference for packed lane 1
+        fused.permute_mut(&mut expected_a);
+        fused.permute_mut(&mut expected_b);
+
+        for i in 0..WIDTH {
+            assert_eq!(
+                packed_input[i].0[0].as_canonical_u64(),
+                expected_a[i].as_canonical_u64(),
+                "Fused packed lane0 mismatch at index {i}"
+            );
+            assert_eq!(
+                packed_input[i].0[1].as_canonical_u64(),
+                expected_b[i].as_canonical_u64(),
+                "Fused packed lane1 mismatch at index {i}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_asm_matches_generic_width_8() {
+        test_asm_matches_generic::<8>();
+    }
+
+    #[test]
+    fn test_asm_matches_generic_width_12() {
+        test_asm_matches_generic::<12>();
+    }
+
+    #[test]
+    fn test_asm_matches_generic_width_16() {
+        test_asm_matches_generic::<16>();
+    }
+
+    #[test]
+    fn test_asm_matches_generic_width_20() {
+        test_asm_matches_generic::<20>();
+    }
+
+    #[test]
+    fn test_fused_matches_generic_width_8() {
+        test_fused_matches_generic::<8>();
+    }
+
+    #[test]
+    fn test_fused_matches_generic_width_12() {
+        test_fused_matches_generic::<12>();
+    }
+
+    #[test]
+    fn test_fused_matches_generic_width_16() {
+        test_fused_matches_generic::<16>();
+    }
+
+    #[test]
+    fn test_fused_matches_generic_width_20() {
+        test_fused_matches_generic::<20>();
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs
new file mode 100644
index 000000000..00b7fdc57
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs
@@ -0,0 +1,2621 @@
+//! ARM assembly primitives for Poseidon2 on Goldilocks.
+//!
+//! Latency hiding: ARM mul/umulh have ~4-5 cycle latency. By interleaving
+//! S-box computation with MDS operations, we hide much of this latency.
+
+use core::arch::aarch64::*;
+use core::arch::asm;
+
+use super::utils::{add_asm, mul_add_asm, mul_asm};
+use crate::P;
+
+/// Compute x / 2 in the Goldilocks field, matching `halve_u64::<P>`: `x >> 1`, plus `(P + 1) / 2` when `x` is odd.
+#[inline(always)]
+unsafe fn div2_asm(x: u64) -> u64 {
+    let shift = (P + 1) >> 1; // (P + 1) / 2: the correction added for odd x (assumes P + 1 fits in u64)
+    let result: u64;
+    let _tmp: u64; // scratch register for the asm; its final value is never read
+
+    unsafe {
+        asm!(
+            // result = x >> 1
+            "lsr   {result}, {x}, #1",
+            // tmp = x & 1 (parity bit of x)
+            "and   {tmp}, {x}, #1",
+            // if tmp != 0 (x odd), tmp := shift, else tmp := 0
+            "cmp   {tmp}, #0",
+            "csel  {tmp}, {shift}, xzr, ne",
+            // result += tmp (plain 64-bit add)
+            "add   {result}, {result}, {tmp}",
+            x      = in(reg) x,
+            shift  = in(reg) shift,
+            tmp    = out(reg) _tmp,
+            result = out(reg) result,
+            options(pure, nomem, nostack), // `cmp` writes the condition flags, so `preserves_flags` must not be added
+        );
+    }
+
+    result
+}
+
+#[inline(always)]
+unsafe fn div4_asm(x: u64) -> u64 { // x / 4 in the field
+    unsafe { div2_asm(div2_asm(x)) } // two successive halvings
+}
+
+#[inline(always)]
+unsafe fn div8_asm(x: u64) -> u64 { // x / 8 in the field
+    unsafe { div2_asm(div4_asm(x)) } // divide by 4, then halve once more (three halvings in total)
+}
+
+#[inline(always)]
+unsafe fn div16_asm(x: u64) -> u64 { // x / 16 in the field
+    unsafe { div2_asm(div8_asm(x)) } // divide by 8, then halve once more (four halvings in total)
+}
+
+#[inline(always)]
+unsafe fn div32_asm(x: u64) -> u64 { // x / 32 in the field
+    unsafe { div4_asm(div8_asm(x)) } // divide by 8, then by 4 (five halvings in total)
+}
+
+/// Compute x * 2^{-32} mod P using the Goldilocks structure (no multiplication needed).
+///
+/// Since P = 2^64 - 2^32 + 1, we have 2^64 ≡ 2^32 - 1 and hence 2^{-32} ≡ 1 - 2^{32} (mod P).
+/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P,
+/// where x_hi = x >> 32, x_lo = x & 0xFFFFFFFF; a borrow in the subtraction is repaired by adding P once.
+#[inline(always)]
+unsafe fn div_2_32_asm(x: u64) -> u64 {
+    let result: u64;
+    let _hi: u64; // the `_`-prefixed locals are scratch registers for the asm; their values are never read
+    let _lo: u64;
+    let _t: u64;
+    let _sum: u64;
+    let _adj: u64;
+
+    unsafe {
+        asm!(
+            "lsr   {hi}, {x}, #32", // hi = x >> 32
+            "and   {lo}, {x}, #0xFFFFFFFF", // lo = low 32 bits of x
+            "add   {sum}, {hi}, {lo}", // sum = hi + lo; both are < 2^32, so this cannot overflow
+            "lsl   {t}, {lo}, #32", // t = lo << 32
+            "subs  {result}, {sum}, {t}", // result = sum - t (wrapping); the carry flag is clear on borrow
+            "csetm {adj:w}, cc", // adj = 0xFFFF_FFFF (2^32 - 1) on borrow, else 0; the 32-bit write zeroes the upper half
+            "sub   {result}, {result}, {adj}", // on borrow: subtracting 2^32 - 1 equals adding P modulo 2^64
+            x      = in(reg) x,
+            hi     = out(reg) _hi,
+            lo     = out(reg) _lo,
+            t      = out(reg) _t,
+            sum    = out(reg) _sum,
+            result = out(reg) result,
+            adj    = lateout(reg) _adj, // lateout may reuse x's register; fine because x is last read before adj is written
+            options(pure, nomem, nostack), // `subs` writes the condition flags, so `preserves_flags` must not be added
+        );
+    }
+
+    result // always < P: either sum - t < 2^33 (no borrow) or P - (t - sum) (borrow)
+}
+
+/// Subtract two Goldilocks elements (a - b): a wrapping 64-bit subtract, then on borrow a single correction of 2^32 - 1 (equivalent to adding P modulo 2^64).
+#[inline(always)]
+unsafe fn sub_asm(a: u64, b: u64) -> u64 {
+    let result: u64;
+    let _adj: u64; // scratch register for the asm; its final value is never read
+
+    unsafe {
+        asm!(
+            "subs  {result}, {a}, {b}", // result = a - b (wrapping); the carry flag is clear on borrow
+            "csetm {adj:w}, cc", // adj = 0xFFFF_FFFF (2^32 - 1) on borrow, else 0; the 32-bit write zeroes the upper half
+            "sub   {result}, {result}, {adj}", // NOTE(review): this correction can itself wrap only if b - a > P (needs b > P) — confirm callers' input ranges
+            a = in(reg) a,
+            b = in(reg) b,
+            result = out(reg) result,
+            adj = out(reg) _adj,
+            options(pure, nomem, nostack), // `subs` writes the condition flags, so `preserves_flags` must not be added
+        );
+    }
+
+    result
+}
+
+/// Split-state generic internal permute: element 0 is carried in the local `s0` across all rounds (one round per entry of `constants`) and stored back once at the end.
+#[inline]
+#[allow(clippy::needless_range_loop)] // the index loops start at 1 and index `state`, `diag` and `diag_muls` together
+pub fn internal_permute_state_asm<const WIDTH: usize>(
+    state: &mut [u64; WIDTH], // permuted in place
+    diag: &[u64; WIDTH], // per-element multipliers of the internal layer's diagonal
+    constants: &[u64], // one round constant per internal round, added to element 0 only
+) {
+    let mut s0 = state[0];
+    for &rc in constants {
+        unsafe {
+            s0 = add_asm(s0, rc);
+            let s0_2 = mul_asm(s0, s0); // S-box on element 0: x^7 built as x^3 * x^4
+            let s0_3 = mul_asm(s0_2, s0);
+            let s0_4 = mul_asm(s0_2, s0_2);
+            s0 = mul_asm(s0_3, s0_4);
+
+            let mut sum_hi: u64 = 0; // sum of state[1..WIDTH], taken before any element is overwritten
+            for i in 1..WIDTH {
+                sum_hi = add_asm(sum_hi, state[i]);
+            }
+
+            let mut diag_muls: [u64; WIDTH] = [0; WIDTH]; // slot 0 stays unused; element 0 is handled via `s0`
+            for i in 1..WIDTH {
+                diag_muls[i] = mul_asm(state[i], diag[i]);
+            }
+
+            let sum = add_asm(sum_hi, s0); // sum of the whole state, using the post-S-box element 0
+            s0 = mul_add_asm(s0, diag[0], sum); // presumably s0 * diag[0] + sum; `mul_add_asm` lives in `super::utils` — confirm
+
+            for i in 1..WIDTH {
+                state[i] = add_asm(diag_muls[i], sum); // new state[i] = old state[i] * diag[i] + sum
+            }
+        }
+    }
+    state[0] = s0;
+}
+
+/// Split-state generic dual-lane internal permute for packed processing: the same round as the single-state generic version, applied to two independent states with the lanes' operations interleaved.
+#[inline]
+#[allow(clippy::needless_range_loop)] // the index loops start at 1 and index both lanes, `diag` and the scratch arrays together
+pub fn internal_permute_split_dual<const WIDTH: usize>(
+    lane0: &mut [u64; WIDTH], // first state, permuted in place
+    lane1: &mut [u64; WIDTH], // second state, permuted in place
+    diag: &[u64; WIDTH], // per-element diagonal multipliers, shared by both lanes
+    constants: &[u64], // one round constant per internal round, shared by both lanes
+) {
+    let mut s0_a = lane0[0]; // `_a` names belong to lane0, `_b` names to lane1
+    let mut s0_b = lane1[0];
+    for &rc in constants {
+        unsafe {
+            s0_a = add_asm(s0_a, rc);
+            s0_b = add_asm(s0_b, rc);
+            let s0_2_a = mul_asm(s0_a, s0_a); // S-box on element 0 of each lane: x^7 built as x^3 * x^4
+            let s0_2_b = mul_asm(s0_b, s0_b);
+            let s0_3_a = mul_asm(s0_2_a, s0_a);
+            let s0_3_b = mul_asm(s0_2_b, s0_b);
+            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
+            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
+            s0_a = mul_asm(s0_3_a, s0_4_a);
+            s0_b = mul_asm(s0_3_b, s0_4_b);
+
+            let mut sum_hi_a: u64 = 0; // sums of elements 1..WIDTH of each lane, taken before any overwrite
+            let mut sum_hi_b: u64 = 0;
+            for i in 1..WIDTH {
+                sum_hi_a = add_asm(sum_hi_a, lane0[i]);
+                sum_hi_b = add_asm(sum_hi_b, lane1[i]);
+            }
+
+            let mut diag_muls_a: [u64; WIDTH] = [0; WIDTH]; // slot 0 stays unused in both scratch arrays
+            let mut diag_muls_b: [u64; WIDTH] = [0; WIDTH];
+            for i in 1..WIDTH {
+                diag_muls_a[i] = mul_asm(lane0[i], diag[i]);
+                diag_muls_b[i] = mul_asm(lane1[i], diag[i]);
+            }
+
+            let sum_a = add_asm(sum_hi_a, s0_a); // full-state sums, using the post-S-box element 0
+            let sum_b = add_asm(sum_hi_b, s0_b);
+            s0_a = mul_add_asm(s0_a, diag[0], sum_a); // presumably s0 * diag[0] + sum; `mul_add_asm` lives in `super::utils` — confirm
+            s0_b = mul_add_asm(s0_b, diag[0], sum_b);
+
+            for i in 1..WIDTH {
+                lane0[i] = add_asm(diag_muls_a[i], sum_a); // new element = old element * diag[i] + sum
+                lane1[i] = add_asm(diag_muls_b[i], sum_b);
+            }
+        }
+    }
+    lane0[0] = s0_a;
+    lane1[0] = s0_b;
+}
+
+/// Split-state W8 internal permute: element 0 is carried in the local `s0` across all rounds; the diagonal is hard-coded as [-2, 1, 2, 1/2, 3, -1/2, -3, -4].
+#[inline]
+pub fn internal_permute_state_asm_w8(state: &mut [u64; 8], constants: &[u64]) {
+    let mut s0 = state[0];
+    for &rc in constants {
+        unsafe {
+            s0 = add_asm(s0, rc); // only element 0 receives the round constant
+            let s0_2 = mul_asm(s0, s0); // S-box on element 0 starts here (x^7 = x^3 * x^4)
+
+            let sum1 = add_asm(state[1], state[2]); // pairwise partial sums of state[1..8], placed between the multiplies
+            let sum2 = add_asm(state[3], state[4]);
+            let sum3 = add_asm(state[5], state[6]);
+
+            let s0_3 = mul_asm(s0_2, s0);
+            let s0_4 = mul_asm(s0_2, s0_2);
+
+            let sum12 = add_asm(sum1, sum2); // state[1] + ... + state[4]
+            let sum37 = add_asm(sum3, state[7]); // state[5] + state[6] + state[7]
+
+            let d1 = state[1]; // d_i = |diag[i]| * state[i]; the sign is applied below via add_asm / sub_asm
+            let d2 = double_asm(state[2]);
+            let d3 = div2_asm(state[3]);
+            let d4 = add_asm(double_asm(state[4]), state[4]); // 3 * state[4]
+
+            let sum_hi = add_asm(sum12, sum37); // sum of state[1..8]
+
+            let d5 = div2_asm(state[5]);
+            let d6 = add_asm(double_asm(state[6]), state[6]); // 3 * state[6]
+            let d7 = double_asm(double_asm(state[7])); // 4 * state[7]
+
+            s0 = mul_asm(s0_3, s0_4);
+            let sum = add_asm(sum_hi, s0); // sum of the whole state, using the post-S-box element 0
+            // V[0]=-2: new_s0 = sum + (-2)*s0 = sum_hi + s0 - 2*s0 = sum_hi - s0
+            s0 = sub_asm(sum_hi, s0);
+
+            state[1] = add_asm(d1, sum);
+            state[2] = add_asm(d2, sum);
+            state[3] = add_asm(d3, sum);
+            state[4] = add_asm(d4, sum);
+            state[5] = sub_asm(sum, d5); // negative diagonal entries: sum - d_i
+            state[6] = sub_asm(sum, d6);
+            state[7] = sub_asm(sum, d7);
+        }
+    }
+    state[0] = s0;
+}
+
+/// Split-state dual-lane W8 internal permute for packed processing: the single-state W8 round applied to two independent states, with the two lanes' operations interleaved.
+#[inline]
+pub fn internal_permute_split_dual_w8(
+    lane0: &mut [u64; 8], // first state, permuted in place (`_a` locals)
+    lane1: &mut [u64; 8], // second state, permuted in place (`_b` locals)
+    constants: &[u64], // one round constant per internal round, shared by both lanes
+) {
+    let mut s0_a = lane0[0];
+    let mut s0_b = lane1[0];
+    for &rc in constants {
+        unsafe {
+            s0_a = add_asm(s0_a, rc);
+            s0_b = add_asm(s0_b, rc);
+
+            let s0_2_a = mul_asm(s0_a, s0_a); // S-box on element 0 of each lane starts here (x^7 = x^3 * x^4)
+            let s0_2_b = mul_asm(s0_b, s0_b);
+
+            let sum1_a = add_asm(lane0[1], lane0[2]); // pairwise partial sums of elements 1..8
+            let sum1_b = add_asm(lane1[1], lane1[2]);
+            let sum2_a = add_asm(lane0[3], lane0[4]);
+            let sum2_b = add_asm(lane1[3], lane1[4]);
+            let sum3_a = add_asm(lane0[5], lane0[6]);
+            let sum3_b = add_asm(lane1[5], lane1[6]);
+
+            let s0_3_a = mul_asm(s0_2_a, s0_a);
+            let s0_3_b = mul_asm(s0_2_b, s0_b);
+            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
+            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
+
+            let sum12_a = add_asm(sum1_a, sum2_a); // elements 1..=4
+            let sum12_b = add_asm(sum1_b, sum2_b);
+            let sum37_a = add_asm(sum3_a, lane0[7]); // elements 5, 6 and 7
+            let sum37_b = add_asm(sum3_b, lane1[7]);
+
+            let d1_a = lane0[1]; // d_i = |diag[i]| * element i; the sign is applied below via add_asm / sub_asm
+            let d1_b = lane1[1];
+            let d2_a = double_asm(lane0[2]);
+            let d2_b = double_asm(lane1[2]);
+            let d3_a = div2_asm(lane0[3]);
+            let d3_b = div2_asm(lane1[3]);
+            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); // 3 * element 4
+            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);
+
+            let sum_hi_a = add_asm(sum12_a, sum37_a); // sum of elements 1..8
+            let sum_hi_b = add_asm(sum12_b, sum37_b);
+
+            let d5_a = div2_asm(lane0[5]);
+            let d5_b = div2_asm(lane1[5]);
+            let d6_a = add_asm(double_asm(lane0[6]), lane0[6]); // 3 * element 6
+            let d6_b = add_asm(double_asm(lane1[6]), lane1[6]);
+            let d7_a = double_asm(double_asm(lane0[7])); // 4 * element 7
+            let d7_b = double_asm(double_asm(lane1[7]));
+
+            s0_a = mul_asm(s0_3_a, s0_4_a);
+            s0_b = mul_asm(s0_3_b, s0_4_b);
+
+            let sum_a = add_asm(sum_hi_a, s0_a); // full-state sums, using the post-S-box element 0
+            let sum_b = add_asm(sum_hi_b, s0_b);
+            s0_a = sub_asm(sum_hi_a, s0_a); // diagonal entry -2 for element 0: sum - 2*s0 = sum_hi - s0
+            s0_b = sub_asm(sum_hi_b, s0_b);
+
+            lane0[1] = add_asm(d1_a, sum_a);
+            lane1[1] = add_asm(d1_b, sum_b);
+            lane0[2] = add_asm(d2_a, sum_a);
+            lane1[2] = add_asm(d2_b, sum_b);
+            lane0[3] = add_asm(d3_a, sum_a);
+            lane1[3] = add_asm(d3_b, sum_b);
+            lane0[4] = add_asm(d4_a, sum_a);
+            lane1[4] = add_asm(d4_b, sum_b);
+            lane0[5] = sub_asm(sum_a, d5_a); // negative diagonal entries: sum - d_i
+            lane1[5] = sub_asm(sum_b, d5_b);
+            lane0[6] = sub_asm(sum_a, d6_a);
+            lane1[6] = sub_asm(sum_b, d6_b);
+            lane0[7] = sub_asm(sum_a, d7_a);
+            lane1[7] = sub_asm(sum_b, d7_b);
+        }
+    }
+    lane0[0] = s0_a;
+    lane1[0] = s0_b;
+}
+
+/// Split-state W12 internal permute: element 0 is carried in the local `s0` across all rounds; the diagonal is hard-coded as [-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/4, -1/4, 1/8].
+#[inline]
+pub fn internal_permute_state_asm_w12(state: &mut [u64; 12], constants: &[u64]) {
+    let mut s0 = state[0];
+    for &rc in constants {
+        unsafe {
+            s0 = add_asm(s0, rc); // only element 0 receives the round constant
+            let s0_2 = mul_asm(s0, s0); // S-box on element 0 starts here (x^7 = x^3 * x^4)
+
+            let sum1 = add_asm(state[1], state[2]); // pairwise partial sums of state[1..12], placed between the multiplies
+            let sum2 = add_asm(state[3], state[4]);
+            let sum3 = add_asm(state[5], state[6]);
+            let sum4 = add_asm(state[7], state[8]);
+            let sum5 = add_asm(state[9], state[10]);
+
+            let s0_3 = mul_asm(s0_2, s0);
+            let s0_4 = mul_asm(s0_2, s0_2);
+
+            let sum12 = add_asm(sum1, sum2); // state[1] + ... + state[4]
+            let sum34 = add_asm(sum3, sum4); // state[5] + ... + state[8]
+            let sum511 = add_asm(sum5, state[11]); // state[9] + state[10] + state[11]
+
+            let d1 = state[1]; // d_i = |diag[i]| * state[i]; the sign is applied below via add_asm / sub_asm
+            let d2 = double_asm(state[2]);
+            let d3 = div2_asm(state[3]);
+            let d4 = add_asm(double_asm(state[4]), state[4]); // 3 * state[4]
+
+            let sum1234 = add_asm(sum12, sum34); // state[1] + ... + state[8]
+
+            let d5 = double_asm(double_asm(state[5])); // 4 * state[5]
+            let d6 = div2_asm(state[6]);
+            let d7 = add_asm(double_asm(state[7]), state[7]); // 3 * state[7]
+            let d8 = double_asm(double_asm(state[8])); // 4 * state[8]
+
+            let sum_hi = add_asm(sum1234, sum511); // sum of state[1..12]
+
+            let d9 = div4_asm(state[9]);
+            let d10 = div4_asm(state[10]);
+            let d11 = div8_asm(state[11]);
+
+            s0 = mul_asm(s0_3, s0_4);
+            let sum = add_asm(sum_hi, s0); // sum of the whole state, using the post-S-box element 0
+            s0 = sub_asm(sum_hi, s0); // diagonal entry -2 for element 0: sum - 2*s0 = sum_hi - s0
+
+            state[1] = add_asm(d1, sum);
+            state[2] = add_asm(d2, sum);
+            state[3] = add_asm(d3, sum);
+            state[4] = add_asm(d4, sum);
+            state[5] = add_asm(d5, sum);
+            state[6] = sub_asm(sum, d6); // negative diagonal entries: sum - d_i
+            state[7] = sub_asm(sum, d7);
+            state[8] = sub_asm(sum, d8);
+            state[9] = add_asm(d9, sum);
+            state[10] = sub_asm(sum, d10);
+            state[11] = add_asm(d11, sum);
+        }
+    }
+    state[0] = s0;
+}
+
+/// Split-state dual-lane W12 internal permute for packed processing: the single-state W12 round applied to two independent states, with the two lanes' operations interleaved.
+#[inline]
+pub fn internal_permute_split_dual_w12(
+    lane0: &mut [u64; 12], // first state, permuted in place (`_a` locals)
+    lane1: &mut [u64; 12], // second state, permuted in place (`_b` locals)
+    constants: &[u64], // one round constant per internal round, shared by both lanes
+) {
+    let mut s0_a = lane0[0];
+    let mut s0_b = lane1[0];
+    for &rc in constants {
+        unsafe {
+            s0_a = add_asm(s0_a, rc);
+            s0_b = add_asm(s0_b, rc);
+
+            let s0_2_a = mul_asm(s0_a, s0_a); // S-box on element 0 of each lane starts here (x^7 = x^3 * x^4)
+            let s0_2_b = mul_asm(s0_b, s0_b);
+
+            let sum1_a = add_asm(lane0[1], lane0[2]); // pairwise partial sums of elements 1..12
+            let sum1_b = add_asm(lane1[1], lane1[2]);
+            let sum2_a = add_asm(lane0[3], lane0[4]);
+            let sum2_b = add_asm(lane1[3], lane1[4]);
+            let sum3_a = add_asm(lane0[5], lane0[6]);
+            let sum3_b = add_asm(lane1[5], lane1[6]);
+            let sum4_a = add_asm(lane0[7], lane0[8]);
+            let sum4_b = add_asm(lane1[7], lane1[8]);
+            let sum5_a = add_asm(lane0[9], lane0[10]);
+            let sum5_b = add_asm(lane1[9], lane1[10]);
+
+            let s0_3_a = mul_asm(s0_2_a, s0_a);
+            let s0_3_b = mul_asm(s0_2_b, s0_b);
+            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
+            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
+
+            let sum12_a = add_asm(sum1_a, sum2_a); // elements 1..=4
+            let sum12_b = add_asm(sum1_b, sum2_b);
+            let sum34_a = add_asm(sum3_a, sum4_a); // elements 5..=8
+            let sum34_b = add_asm(sum3_b, sum4_b);
+            let sum511_a = add_asm(sum5_a, lane0[11]); // elements 9, 10 and 11
+            let sum511_b = add_asm(sum5_b, lane1[11]);
+
+            let d1_a = lane0[1]; // d_i = |diag[i]| * element i; the sign is applied below via add_asm / sub_asm
+            let d1_b = lane1[1];
+            let d2_a = double_asm(lane0[2]);
+            let d2_b = double_asm(lane1[2]);
+            let d3_a = div2_asm(lane0[3]);
+            let d3_b = div2_asm(lane1[3]);
+            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); // 3 * element 4
+            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);
+
+            let sum1234_a = add_asm(sum12_a, sum34_a); // elements 1..=8
+            let sum1234_b = add_asm(sum12_b, sum34_b);
+
+            let d5_a = double_asm(double_asm(lane0[5])); // 4 * element 5
+            let d5_b = double_asm(double_asm(lane1[5]));
+            let d6_a = div2_asm(lane0[6]);
+            let d6_b = div2_asm(lane1[6]);
+            let d7_a = add_asm(double_asm(lane0[7]), lane0[7]); // 3 * element 7
+            let d7_b = add_asm(double_asm(lane1[7]), lane1[7]);
+            let d8_a = double_asm(double_asm(lane0[8])); // 4 * element 8
+            let d8_b = double_asm(double_asm(lane1[8]));
+
+            let sum_hi_a = add_asm(sum1234_a, sum511_a); // sum of elements 1..12
+            let sum_hi_b = add_asm(sum1234_b, sum511_b);
+
+            let d9_a = div4_asm(lane0[9]);
+            let d9_b = div4_asm(lane1[9]);
+            let d10_a = div4_asm(lane0[10]);
+            let d10_b = div4_asm(lane1[10]);
+            let d11_a = div8_asm(lane0[11]);
+            let d11_b = div8_asm(lane1[11]);
+
+            s0_a = mul_asm(s0_3_a, s0_4_a);
+            s0_b = mul_asm(s0_3_b, s0_4_b);
+
+            let sum_a = add_asm(sum_hi_a, s0_a); // full-state sums, using the post-S-box element 0
+            let sum_b = add_asm(sum_hi_b, s0_b);
+            s0_a = sub_asm(sum_hi_a, s0_a); // diagonal entry -2 for element 0: sum - 2*s0 = sum_hi - s0
+            s0_b = sub_asm(sum_hi_b, s0_b);
+
+            lane0[1] = add_asm(d1_a, sum_a);
+            lane1[1] = add_asm(d1_b, sum_b);
+            lane0[2] = add_asm(d2_a, sum_a);
+            lane1[2] = add_asm(d2_b, sum_b);
+            lane0[3] = add_asm(d3_a, sum_a);
+            lane1[3] = add_asm(d3_b, sum_b);
+            lane0[4] = add_asm(d4_a, sum_a);
+            lane1[4] = add_asm(d4_b, sum_b);
+            lane0[5] = add_asm(d5_a, sum_a);
+            lane1[5] = add_asm(d5_b, sum_b);
+            lane0[6] = sub_asm(sum_a, d6_a); // negative diagonal entries: sum - d_i
+            lane1[6] = sub_asm(sum_b, d6_b);
+            lane0[7] = sub_asm(sum_a, d7_a);
+            lane1[7] = sub_asm(sum_b, d7_b);
+            lane0[8] = sub_asm(sum_a, d8_a);
+            lane1[8] = sub_asm(sum_b, d8_b);
+            lane0[9] = add_asm(d9_a, sum_a);
+            lane1[9] = add_asm(d9_b, sum_b);
+            lane0[10] = sub_asm(sum_a, d10_a);
+            lane1[10] = sub_asm(sum_b, d10_b);
+            lane0[11] = add_asm(d11_a, sum_a);
+            lane1[11] = add_asm(d11_b, sum_b);
+        }
+    }
+    lane0[0] = s0_a;
+    lane1[0] = s0_b;
+}
+
+/// Split-state W16 internal permute: element 0 is carried in the local `s0` across all rounds; the diagonal is hard-coded as [-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/8, 1/16, 1/32, -1/8, -1/16, -1/32, 2^-32].
+#[inline]
+pub fn internal_permute_state_asm_w16(state: &mut [u64; 16], constants: &[u64]) {
+    let mut s0 = state[0];
+    for &rc in constants {
+        unsafe {
+            s0 = add_asm(s0, rc); // only element 0 receives the round constant
+            let s0_2 = mul_asm(s0, s0); // S-box on element 0 starts here (x^7 = x^3 * x^4)
+
+            let sum1 = add_asm(state[1], state[2]); // pairwise partial sums of state[1..16], placed between the multiplies
+            let sum2 = add_asm(state[3], state[4]);
+            let sum3 = add_asm(state[5], state[6]);
+            let sum4 = add_asm(state[7], state[8]);
+            let sum5 = add_asm(state[9], state[10]);
+            let sum6 = add_asm(state[11], state[12]);
+            let sum7 = add_asm(state[13], state[14]);
+
+            let s0_3 = mul_asm(s0_2, s0);
+            let s0_4 = mul_asm(s0_2, s0_2);
+
+            let sum12 = add_asm(sum1, sum2); // state[1] + ... + state[4]
+            let sum34 = add_asm(sum3, sum4); // state[5] + ... + state[8]
+            let sum56 = add_asm(sum5, sum6); // state[9] + ... + state[12]
+            let sum715 = add_asm(sum7, state[15]); // state[13] + state[14] + state[15]
+
+            let sum1234 = add_asm(sum12, sum34);
+            let sum56715 = add_asm(sum56, sum715);
+            let sum_hi = add_asm(sum1234, sum56715); // sum of state[1..16]
+
+            let d1 = state[1]; // d_i = |diag[i]| * state[i]; the sign is applied below via add_asm / sub_asm
+            let d2 = double_asm(state[2]);
+            let d3 = div2_asm(state[3]);
+            let d4 = add_asm(double_asm(state[4]), state[4]); // 3 * state[4]
+            let d5 = double_asm(double_asm(state[5])); // 4 * state[5]
+            let d6 = div2_asm(state[6]);
+            let d7 = add_asm(double_asm(state[7]), state[7]); // 3 * state[7]
+            let d8 = double_asm(double_asm(state[8])); // 4 * state[8]
+
+            let d9 = div8_asm(state[9]); // elements 9..=14 use inverse powers of two: 1/8, 1/16, 1/32, twice
+            let d10 = div16_asm(state[10]);
+            let d11 = div32_asm(state[11]);
+            let d12 = div8_asm(state[12]);
+            let d13 = div16_asm(state[13]);
+            let d14 = div32_asm(state[14]);
+            let d15 = div_2_32_asm(state[15]); // state[15] * 2^-32
+
+            s0 = mul_asm(s0_3, s0_4);
+            let sum = add_asm(sum_hi, s0); // sum of the whole state, using the post-S-box element 0
+            s0 = sub_asm(sum_hi, s0); // diagonal entry -2 for element 0: sum - 2*s0 = sum_hi - s0
+
+            state[1] = add_asm(d1, sum);
+            state[2] = add_asm(d2, sum);
+            state[3] = add_asm(d3, sum);
+            state[4] = add_asm(d4, sum);
+            state[5] = add_asm(d5, sum);
+            state[6] = sub_asm(sum, d6); // negative diagonal entries: sum - d_i
+            state[7] = sub_asm(sum, d7);
+            state[8] = sub_asm(sum, d8);
+            state[9] = add_asm(d9, sum);
+            state[10] = add_asm(d10, sum);
+            state[11] = add_asm(d11, sum);
+            state[12] = sub_asm(sum, d12);
+            state[13] = sub_asm(sum, d13);
+            state[14] = sub_asm(sum, d14);
+            state[15] = add_asm(d15, sum);
+        }
+    }
+    state[0] = s0;
+}
+
+/// Split-state dual-lane W16 internal permute for packed processing: the single-state W16 round applied to two independent states, with the two lanes' operations interleaved.
+#[inline]
+pub fn internal_permute_split_dual_w16(
+    lane0: &mut [u64; 16], // first state, permuted in place (`_a` locals)
+    lane1: &mut [u64; 16], // second state, permuted in place (`_b` locals)
+    constants: &[u64], // one round constant per internal round, shared by both lanes
+) {
+    let mut s0_a = lane0[0];
+    let mut s0_b = lane1[0];
+    for &rc in constants {
+        unsafe {
+            s0_a = add_asm(s0_a, rc);
+            s0_b = add_asm(s0_b, rc);
+
+            let s0_2_a = mul_asm(s0_a, s0_a); // S-box on element 0 of each lane starts here (x^7 = x^3 * x^4)
+            let s0_2_b = mul_asm(s0_b, s0_b);
+
+            let sum1_a = add_asm(lane0[1], lane0[2]); // pairwise partial sums of elements 1..16
+            let sum1_b = add_asm(lane1[1], lane1[2]);
+            let sum2_a = add_asm(lane0[3], lane0[4]);
+            let sum2_b = add_asm(lane1[3], lane1[4]);
+            let sum3_a = add_asm(lane0[5], lane0[6]);
+            let sum3_b = add_asm(lane1[5], lane1[6]);
+            let sum4_a = add_asm(lane0[7], lane0[8]);
+            let sum4_b = add_asm(lane1[7], lane1[8]);
+            let sum5_a = add_asm(lane0[9], lane0[10]);
+            let sum5_b = add_asm(lane1[9], lane1[10]);
+            let sum6_a = add_asm(lane0[11], lane0[12]);
+            let sum6_b = add_asm(lane1[11], lane1[12]);
+            let sum7_a = add_asm(lane0[13], lane0[14]);
+            let sum7_b = add_asm(lane1[13], lane1[14]);
+
+            let s0_3_a = mul_asm(s0_2_a, s0_a);
+            let s0_3_b = mul_asm(s0_2_b, s0_b);
+            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
+            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
+
+            let sum12_a = add_asm(sum1_a, sum2_a); // elements 1..=4
+            let sum12_b = add_asm(sum1_b, sum2_b);
+            let sum34_a = add_asm(sum3_a, sum4_a); // elements 5..=8
+            let sum34_b = add_asm(sum3_b, sum4_b);
+            let sum56_a = add_asm(sum5_a, sum6_a); // elements 9..=12
+            let sum56_b = add_asm(sum5_b, sum6_b);
+            let sum715_a = add_asm(sum7_a, lane0[15]); // elements 13, 14 and 15
+            let sum715_b = add_asm(sum7_b, lane1[15]);
+
+            let sum1234_a = add_asm(sum12_a, sum34_a);
+            let sum1234_b = add_asm(sum12_b, sum34_b);
+            let sum56715_a = add_asm(sum56_a, sum715_a);
+            let sum56715_b = add_asm(sum56_b, sum715_b);
+            let sum_hi_a = add_asm(sum1234_a, sum56715_a); // sum of elements 1..16
+            let sum_hi_b = add_asm(sum1234_b, sum56715_b);
+
+            let d1_a = lane0[1]; // d_i = |diag[i]| * element i; the sign is applied below via add_asm / sub_asm
+            let d1_b = lane1[1];
+            let d2_a = double_asm(lane0[2]);
+            let d2_b = double_asm(lane1[2]);
+            let d3_a = div2_asm(lane0[3]);
+            let d3_b = div2_asm(lane1[3]);
+            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); // 3 * element 4
+            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);
+            let d5_a = double_asm(double_asm(lane0[5])); // 4 * element 5
+            let d5_b = double_asm(double_asm(lane1[5]));
+            let d6_a = div2_asm(lane0[6]);
+            let d6_b = div2_asm(lane1[6]);
+            let d7_a = add_asm(double_asm(lane0[7]), lane0[7]); // 3 * element 7
+            let d7_b = add_asm(double_asm(lane1[7]), lane1[7]);
+            let d8_a = double_asm(double_asm(lane0[8])); // 4 * element 8
+            let d8_b = double_asm(double_asm(lane1[8]));
+
+            let d9_a = div8_asm(lane0[9]); // elements 9..=14 use inverse powers of two: 1/8, 1/16, 1/32, twice
+            let d9_b = div8_asm(lane1[9]);
+            let d10_a = div16_asm(lane0[10]);
+            let d10_b = div16_asm(lane1[10]);
+            let d11_a = div32_asm(lane0[11]);
+            let d11_b = div32_asm(lane1[11]);
+            let d12_a = div8_asm(lane0[12]);
+            let d12_b = div8_asm(lane1[12]);
+            let d13_a = div16_asm(lane0[13]);
+            let d13_b = div16_asm(lane1[13]);
+            let d14_a = div32_asm(lane0[14]);
+            let d14_b = div32_asm(lane1[14]);
+            let d15_a = div_2_32_asm(lane0[15]); // element 15 * 2^-32
+            let d15_b = div_2_32_asm(lane1[15]);
+
+            s0_a = mul_asm(s0_3_a, s0_4_a);
+            s0_b = mul_asm(s0_3_b, s0_4_b);
+
+            let sum_a = add_asm(sum_hi_a, s0_a); // full-state sums, using the post-S-box element 0
+            let sum_b = add_asm(sum_hi_b, s0_b);
+            s0_a = sub_asm(sum_hi_a, s0_a); // diagonal entry -2 for element 0: sum - 2*s0 = sum_hi - s0
+            s0_b = sub_asm(sum_hi_b, s0_b);
+
+            lane0[1] = add_asm(d1_a, sum_a);
+            lane1[1] = add_asm(d1_b, sum_b);
+            lane0[2] = add_asm(d2_a, sum_a);
+            lane1[2] = add_asm(d2_b, sum_b);
+            lane0[3] = add_asm(d3_a, sum_a);
+            lane1[3] = add_asm(d3_b, sum_b);
+            lane0[4] = add_asm(d4_a, sum_a);
+            lane1[4] = add_asm(d4_b, sum_b);
+            lane0[5] = add_asm(d5_a, sum_a);
+            lane1[5] = add_asm(d5_b, sum_b);
+            lane0[6] = sub_asm(sum_a, d6_a); // negative diagonal entries: sum - d_i
+            lane1[6] = sub_asm(sum_b, d6_b);
+            lane0[7] = sub_asm(sum_a, d7_a);
+            lane1[7] = sub_asm(sum_b, d7_b);
+            lane0[8] = sub_asm(sum_a, d8_a);
+            lane1[8] = sub_asm(sum_b, d8_b);
+            lane0[9] = add_asm(d9_a, sum_a);
+            lane1[9] = add_asm(d9_b, sum_b);
+            lane0[10] = add_asm(d10_a, sum_a);
+            lane1[10] = add_asm(d10_b, sum_b);
+            lane0[11] = add_asm(d11_a, sum_a);
+            lane1[11] = add_asm(d11_b, sum_b);
+            lane0[12] = sub_asm(sum_a, d12_a);
+            lane1[12] = sub_asm(sum_b, d12_b);
+            lane0[13] = sub_asm(sum_a, d13_a);
+            lane1[13] = sub_asm(sum_b, d13_b);
+            lane0[14] = sub_asm(sum_a, d14_a);
+            lane1[14] = sub_asm(sum_b, d14_b);
+            lane0[15] = add_asm(d15_a, sum_a);
+            lane1[15] = add_asm(d15_b, sum_b);
+        }
+    }
+    lane0[0] = s0_a;
+    lane1[0] = s0_b;
+}
+
+// External layer: S-box on all elements, then MDS. Pipelined for latency hiding.
+
+/// Double a Goldilocks element, computed as `add_asm(a, a)`.
+#[inline(always)]
+unsafe fn double_asm(a: u64) -> u64 {
+    // SAFETY: forwards to add_asm, which is presumably sound for any valid Goldilocks element — confirm in `super::utils`
+    unsafe { add_asm(a, a) }
+}
+
+/// 4x4 circulant MDS with coefficients [2,3,1,1], applied to `x` in place using additions and doublings only.
+#[inline(always)]
+unsafe fn apply_mat4_asm(x: &mut [u64; 4]) {
+    unsafe {
+        let t01 = add_asm(x[0], x[1]);
+        let t23 = add_asm(x[2], x[3]);
+        let t0123 = add_asm(t01, t23); // x0 + x1 + x2 + x3
+        let t01123 = add_asm(t0123, x[1]); // x0 + 2*x1 + x2 + x3
+        let t01233 = add_asm(t0123, x[3]); // x0 + x1 + x2 + 2*x3
+
+        let y3 = add_asm(t01233, double_asm(x[0])); // 3*x0 + x1 + x2 + 2*x3
+        let y1 = add_asm(t01123, double_asm(x[2])); // x0 + 2*x1 + 3*x2 + x3
+        let y0 = add_asm(t01123, t01); // 2*x0 + 3*x1 + x2 + x3
+        let y2 = add_asm(t01233, t23); // x0 + x1 + 2*x2 + 3*x3
+
+        x[0] = y0; // outputs are stored only after every input has been read
+        x[1] = y1;
+        x[2] = y2;
+        x[3] = y3;
+    }
+}
+
+/// Poseidon2 MDS light permutation: 4x4 blocks + outer sums, in place. Panics (slice index out of range) if WIDTH is not a multiple of 4.
+#[inline(always)]
+pub unsafe fn mds_light_permutation_asm<const WIDTH: usize>(state: &mut [u64; WIDTH]) {
+    unsafe {
+        // Apply M_4 (see apply_mat4_asm) to each consecutive four elements
+        let mut i = 0;
+        while i < WIDTH {
+            let chunk: &mut [u64; 4] = (&mut state[i..i + 4]).try_into().unwrap(); // the slice has length 4, so the conversion cannot fail
+            apply_mat4_asm(chunk);
+            i += 4;
+        }
+
+        // Compute the four sums of every 4th element: sums[k] adds state[k], state[k + 4], state[k + 8], ...
+        let mut sums = [0u64; 4];
+        for j in (0..WIDTH).step_by(4) {
+            sums[0] = add_asm(sums[0], state[j]);
+            sums[1] = add_asm(sums[1], state[j + 1]);
+            sums[2] = add_asm(sums[2], state[j + 2]);
+            sums[3] = add_asm(sums[3], state[j + 3]);
+        }
+
+        // Add sums back to state (each element's own block output is therefore counted twice)
+        for (i, elem) in state.iter_mut().enumerate() {
+            *elem = add_asm(*elem, sums[i % 4]);
+        }
+    }
+}
+
+/// Pipelined S-box computation for all elements, in place.
+/// Computes x^7 for all elements stage by stage (all x^2, then all x^3 and x^4, then all x^7) so independent multiplies sit next to each other.
+#[inline(always)]
+pub unsafe fn sbox_layer_asm<const WIDTH: usize>(state: &mut [u64; WIDTH]) {
+    unsafe {
+        // Stage 1: Compute all x^2 values
+        let mut x2 = [0u64; WIDTH];
+        for i in 0..WIDTH {
+            x2[i] = mul_asm(state[i], state[i]);
+        }
+
+        // Stage 2: Compute x^3 and x^4 values interleaved
+        // x^3 = x^2 * x, x^4 = x^2 * x^2 (both depend only on stage-1 results and the input)
+        let mut x3 = [0u64; WIDTH];
+        let mut x4 = [0u64; WIDTH];
+        for i in 0..WIDTH {
+            x3[i] = mul_asm(x2[i], state[i]);
+            x4[i] = mul_asm(x2[i], x2[i]);
+        }
+
+        // Stage 3: Compute x^7 = x^3 * x^4 and overwrite the input
+        for i in 0..WIDTH {
+            state[i] = mul_asm(x3[i], x4[i]);
+        }
+    }
+}
+
+/// Optimized external round, in place: add the per-element round constants `rc`, apply the S-box to every element, then the MDS light permutation.
+#[inline(always)]
+pub unsafe fn external_round_asm<const WIDTH: usize>(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) {
+    unsafe {
+        // Add round constants (one per element)
+        for i in 0..WIDTH {
+            state[i] = add_asm(state[i], rc[i]);
+        }
+
+        // Apply S-box (x^7) to all elements
+        sbox_layer_asm(state);
+
+        // Apply MDS light permutation (panics if WIDTH is not a multiple of 4)
+        mds_light_permutation_asm(state);
+    }
+}
+
+/// Interleaved dual-lane S-box layer for better ILP: x^7 on every element of two independent states, in place, alternating between the lanes at each stage.
+#[inline(always)]
+pub unsafe fn sbox_layer_dual_asm<const WIDTH: usize>(
+    state0: &mut [u64; WIDTH], // first state (`_a` locals)
+    state1: &mut [u64; WIDTH], // second state (`_b` locals)
+) {
+    unsafe {
+        // Stage 1: Compute all x^2 values for both lanes (interleaved)
+        let mut x2_a = [0u64; WIDTH];
+        let mut x2_b = [0u64; WIDTH];
+        for i in 0..WIDTH {
+            x2_a[i] = mul_asm(state0[i], state0[i]);
+            x2_b[i] = mul_asm(state1[i], state1[i]);
+        }
+
+        // Stage 2: Compute x^3 = x^2 * x and x^4 = x^2 * x^2 for both lanes (interleaved)
+        let mut x3_a = [0u64; WIDTH];
+        let mut x3_b = [0u64; WIDTH];
+        let mut x4_a = [0u64; WIDTH];
+        let mut x4_b = [0u64; WIDTH];
+        for i in 0..WIDTH {
+            x3_a[i] = mul_asm(x2_a[i], state0[i]);
+            x3_b[i] = mul_asm(x2_b[i], state1[i]);
+            x4_a[i] = mul_asm(x2_a[i], x2_a[i]);
+            x4_b[i] = mul_asm(x2_b[i], x2_b[i]);
+        }
+
+        // Stage 3: Compute x^7 = x^3 * x^4 for both lanes and overwrite the inputs
+        for i in 0..WIDTH {
+            state0[i] = mul_asm(x3_a[i], x4_a[i]);
+            state1[i] = mul_asm(x3_b[i], x4_b[i]);
+        }
+    }
+}
+
+/// Interleaved dual-lane external round for better ILP: the same steps as the single-state external round, applied in place to two independent states that share `rc`.
+#[inline(always)]
+pub unsafe fn external_round_dual_asm<const WIDTH: usize>(
+    state0: &mut [u64; WIDTH], // first state
+    state1: &mut [u64; WIDTH], // second state
+    rc: &[u64; WIDTH], // per-element round constants, added to both states
+) {
+    unsafe {
+        // Add round constants (interleaved between the two lanes)
+        for i in 0..WIDTH {
+            state0[i] = add_asm(state0[i], rc[i]);
+            state1[i] = add_asm(state1[i], rc[i]);
+        }
+
+        // Apply S-box (interleaved dual-lane)
+        sbox_layer_dual_asm(state0, state1);
+
+        // Apply MDS (sequential - MDS is mostly additions which are fast); panics if WIDTH is not a multiple of 4
+        mds_light_permutation_asm(state0);
+        mds_light_permutation_asm(state1);
+    }
+}
+
+/// Fully unrolled and fused external round for W8: round-constant addition and the x^7 S-box are written out element by element, followed by the shared MDS light permutation.
+#[inline(always)]
+pub unsafe fn external_round_fused_w8(state: &mut [u64; 8], rc: &[u64; 8]) {
+    unsafe {
+        let s0 = add_asm(state[0], rc[0]); // stage 1, two elements at a time: s_i = state[i] + rc[i], then x2_i = s_i^2
+        let s1 = add_asm(state[1], rc[1]);
+        let x2_0 = mul_asm(s0, s0);
+        let x2_1 = mul_asm(s1, s1);
+
+        let s2 = add_asm(state[2], rc[2]);
+        let s3 = add_asm(state[3], rc[3]);
+        let x2_2 = mul_asm(s2, s2);
+        let x2_3 = mul_asm(s3, s3);
+
+        let s4 = add_asm(state[4], rc[4]);
+        let s5 = add_asm(state[5], rc[5]);
+        let x2_4 = mul_asm(s4, s4);
+        let x2_5 = mul_asm(s5, s5);
+
+        let s6 = add_asm(state[6], rc[6]);
+        let s7 = add_asm(state[7], rc[7]);
+        let x2_6 = mul_asm(s6, s6);
+        let x2_7 = mul_asm(s7, s7);
+
+        let x3_0 = mul_asm(x2_0, s0); // stage 2: x3_i = s_i^3 and x4_i = s_i^4, again in pairs of elements
+        let x3_1 = mul_asm(x2_1, s1);
+        let x4_0 = mul_asm(x2_0, x2_0);
+        let x4_1 = mul_asm(x2_1, x2_1);
+        let x3_2 = mul_asm(x2_2, s2);
+        let x3_3 = mul_asm(x2_3, s3);
+        let x4_2 = mul_asm(x2_2, x2_2);
+        let x4_3 = mul_asm(x2_3, x2_3);
+        let x3_4 = mul_asm(x2_4, s4);
+        let x3_5 = mul_asm(x2_5, s5);
+        let x4_4 = mul_asm(x2_4, x2_4);
+        let x4_5 = mul_asm(x2_5, x2_5);
+        let x3_6 = mul_asm(x2_6, s6);
+        let x3_7 = mul_asm(x2_7, s7);
+        let x4_6 = mul_asm(x2_6, x2_6);
+        let x4_7 = mul_asm(x2_7, x2_7);
+
+        state[0] = mul_asm(x3_0, x4_0); // stage 3: s_i^7 = s_i^3 * s_i^4, written back into the state
+        state[1] = mul_asm(x3_1, x4_1);
+        state[2] = mul_asm(x3_2, x4_2);
+        state[3] = mul_asm(x3_3, x4_3);
+        state[4] = mul_asm(x3_4, x4_4);
+        state[5] = mul_asm(x3_5, x4_5);
+        state[6] = mul_asm(x3_6, x4_6);
+        state[7] = mul_asm(x3_7, x4_7);
+
+        mds_light_permutation_asm(state); // linear layer, not unrolled here
+    }
+}
+
+/// Fully unrolled and fused dual-lane external round for W8.
+///
+/// Applies the same round to two independent states, `state0` and `state1`,
+/// using the same round constants `rc` for both. Per state the round is:
+/// add `rc[i]` to element `i` (`add_asm`), raise each element to the 7th
+/// power (`mul_asm`, as `x^3 * x^4`), then run `mds_light_permutation_asm`.
+///
+/// The constant addition and S-box are done in two halves (elements 0-3, then
+/// 4-7), with the operations of the two states alternating line by line. The
+/// MDS step is then applied to each state separately.
+///
+/// # Safety
+///
+/// NOTE(review): the body only calls `add_asm`, `mul_asm` and
+/// `mds_light_permutation_asm`, all defined outside this section. The caller
+/// presumably has to uphold whatever those helpers require — confirm and
+/// document the concrete contract here.
+#[inline(always)]
+pub unsafe fn external_round_fused_dual_w8(
+    state0: &mut [u64; 8],
+    state1: &mut [u64; 8],
+    rc: &[u64; 8],
+) {
+    unsafe {
+        // Half 1: elements 0-3 across both lanes
+        // (`_a` suffixes belong to `state0`, `_b` suffixes to `state1`).
+        let s0_a = add_asm(state0[0], rc[0]);
+        let s0_b = add_asm(state1[0], rc[0]);
+        let s1_a = add_asm(state0[1], rc[1]);
+        let s1_b = add_asm(state1[1], rc[1]);
+        let s2_a = add_asm(state0[2], rc[2]);
+        let s2_b = add_asm(state1[2], rc[2]);
+        let s3_a = add_asm(state0[3], rc[3]);
+        let s3_b = add_asm(state1[3], rc[3]);
+
+        // Squares.
+        let x2_0a = mul_asm(s0_a, s0_a);
+        let x2_0b = mul_asm(s0_b, s0_b);
+        let x2_1a = mul_asm(s1_a, s1_a);
+        let x2_1b = mul_asm(s1_b, s1_b);
+        let x2_2a = mul_asm(s2_a, s2_a);
+        let x2_2b = mul_asm(s2_b, s2_b);
+        let x2_3a = mul_asm(s3_a, s3_a);
+        let x2_3b = mul_asm(s3_b, s3_b);
+
+        // Cubes (x^2 * x) and fourth powers (x^2 * x^2).
+        let x3_0a = mul_asm(x2_0a, s0_a);
+        let x3_0b = mul_asm(x2_0b, s0_b);
+        let x4_0a = mul_asm(x2_0a, x2_0a);
+        let x4_0b = mul_asm(x2_0b, x2_0b);
+        let x3_1a = mul_asm(x2_1a, s1_a);
+        let x3_1b = mul_asm(x2_1b, s1_b);
+        let x4_1a = mul_asm(x2_1a, x2_1a);
+        let x4_1b = mul_asm(x2_1b, x2_1b);
+        let x3_2a = mul_asm(x2_2a, s2_a);
+        let x3_2b = mul_asm(x2_2b, s2_b);
+        let x4_2a = mul_asm(x2_2a, x2_2a);
+        let x4_2b = mul_asm(x2_2b, x2_2b);
+        let x3_3a = mul_asm(x2_3a, s3_a);
+        let x3_3b = mul_asm(x2_3b, s3_b);
+        let x4_3a = mul_asm(x2_3a, x2_3a);
+        let x4_3b = mul_asm(x2_3b, x2_3b);
+
+        // x^7 = x^3 * x^4, written back for elements 0-3.
+        state0[0] = mul_asm(x3_0a, x4_0a);
+        state1[0] = mul_asm(x3_0b, x4_0b);
+        state0[1] = mul_asm(x3_1a, x4_1a);
+        state1[1] = mul_asm(x3_1b, x4_1b);
+        state0[2] = mul_asm(x3_2a, x4_2a);
+        state1[2] = mul_asm(x3_2b, x4_2b);
+        state0[3] = mul_asm(x3_3a, x4_3a);
+        state1[3] = mul_asm(x3_3b, x4_3b);
+
+        // Half 2: elements 4-7 across both lanes
+        let s4_a = add_asm(state0[4], rc[4]);
+        let s4_b = add_asm(state1[4], rc[4]);
+        let s5_a = add_asm(state0[5], rc[5]);
+        let s5_b = add_asm(state1[5], rc[5]);
+        let s6_a = add_asm(state0[6], rc[6]);
+        let s6_b = add_asm(state1[6], rc[6]);
+        let s7_a = add_asm(state0[7], rc[7]);
+        let s7_b = add_asm(state1[7], rc[7]);
+
+        // Squares.
+        let x2_4a = mul_asm(s4_a, s4_a);
+        let x2_4b = mul_asm(s4_b, s4_b);
+        let x2_5a = mul_asm(s5_a, s5_a);
+        let x2_5b = mul_asm(s5_b, s5_b);
+        let x2_6a = mul_asm(s6_a, s6_a);
+        let x2_6b = mul_asm(s6_b, s6_b);
+        let x2_7a = mul_asm(s7_a, s7_a);
+        let x2_7b = mul_asm(s7_b, s7_b);
+
+        // Cubes (x^2 * x) and fourth powers (x^2 * x^2).
+        let x3_4a = mul_asm(x2_4a, s4_a);
+        let x3_4b = mul_asm(x2_4b, s4_b);
+        let x4_4a = mul_asm(x2_4a, x2_4a);
+        let x4_4b = mul_asm(x2_4b, x2_4b);
+        let x3_5a = mul_asm(x2_5a, s5_a);
+        let x3_5b = mul_asm(x2_5b, s5_b);
+        let x4_5a = mul_asm(x2_5a, x2_5a);
+        let x4_5b = mul_asm(x2_5b, x2_5b);
+        let x3_6a = mul_asm(x2_6a, s6_a);
+        let x3_6b = mul_asm(x2_6b, s6_b);
+        let x4_6a = mul_asm(x2_6a, x2_6a);
+        let x4_6b = mul_asm(x2_6b, x2_6b);
+        let x3_7a = mul_asm(x2_7a, s7_a);
+        let x3_7b = mul_asm(x2_7b, s7_b);
+        let x4_7a = mul_asm(x2_7a, x2_7a);
+        let x4_7b = mul_asm(x2_7b, x2_7b);
+
+        // x^7 = x^3 * x^4, written back for elements 4-7.
+        state0[4] = mul_asm(x3_4a, x4_4a);
+        state1[4] = mul_asm(x3_4b, x4_4b);
+        state0[5] = mul_asm(x3_5a, x4_5a);
+        state1[5] = mul_asm(x3_5b, x4_5b);
+        state0[6] = mul_asm(x3_6a, x4_6a);
+        state1[6] = mul_asm(x3_6b, x4_6b);
+        state0[7] = mul_asm(x3_7a, x4_7a);
+        state1[7] = mul_asm(x3_7b, x4_7b);
+
+        // Linear layer, one state after the other.
+        mds_light_permutation_asm(state0);
+        mds_light_permutation_asm(state1);
+    }
+}
+
+/// Applies the initial external rounds of the permutation to `state`.
+///
+/// The state first goes through one `mds_light_permutation_asm` call; after
+/// that, `external_round_asm` runs once per entry of `initial_constants`, in
+/// slice order. An empty slice therefore still applies the leading MDS step.
+///
+/// Round constants are taken as raw `u64` values, already converted by the
+/// caller.
+#[inline]
+pub fn external_initial_permute_state_asm<const WIDTH: usize>(
+    state: &mut [u64; WIDTH],
+    initial_constants: &[[u64; WIDTH]],
+) {
+    // SAFETY: NOTE(review): the preconditions of `mds_light_permutation_asm`
+    // are declared outside this section — confirm they hold for any `state`.
+    unsafe { mds_light_permutation_asm(state) };
+
+    initial_constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): as above, for `external_round_asm`.
+        unsafe { external_round_asm(&mut *state, round_constants) }
+    });
+}
+
+/// Applies the terminal external rounds of the permutation to `state`.
+///
+/// `external_round_asm` runs once per entry of `terminal_constants`, in slice
+/// order. No extra MDS step is applied here, so an empty slice leaves `state`
+/// untouched.
+///
+/// Round constants are taken as raw `u64` values, already converted by the
+/// caller.
+#[inline]
+pub fn external_terminal_permute_state_asm<const WIDTH: usize>(
+    state: &mut [u64; WIDTH],
+    terminal_constants: &[[u64; WIDTH]],
+) {
+    terminal_constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): the preconditions of `external_round_asm` are
+        // declared outside this section — confirm they hold for any `state`.
+        unsafe { external_round_asm(&mut *state, round_constants) }
+    });
+}
+
+/// Width-8 variant of the initial external rounds, built on the fused round.
+///
+/// Runs one `mds_light_permutation_asm` over `state`, then one
+/// `external_round_fused_w8` per entry of `initial_constants`, in slice order.
+#[inline]
+pub fn external_initial_permute_w8(state: &mut [u64; 8], initial_constants: &[[u64; 8]]) {
+    // SAFETY: NOTE(review): the preconditions of `mds_light_permutation_asm`
+    // are declared outside this section — confirm they hold for any `state`.
+    unsafe { mds_light_permutation_asm(state) };
+
+    for round_constants in initial_constants.iter() {
+        // SAFETY: NOTE(review): `external_round_fused_w8` only forwards to the
+        // scalar ASM helpers; confirm their contract is met here.
+        unsafe { external_round_fused_w8(state, round_constants) };
+    }
+}
+
+/// Width-8 variant of the terminal external rounds, built on the fused round.
+///
+/// Runs one `external_round_fused_w8` per entry of `terminal_constants`, in
+/// slice order. An empty slice leaves `state` untouched.
+#[inline]
+pub fn external_terminal_permute_w8(state: &mut [u64; 8], terminal_constants: &[[u64; 8]]) {
+    terminal_constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): `external_round_fused_w8` only forwards to the
+        // scalar ASM helpers; confirm their contract is met here.
+        unsafe { external_round_fused_w8(&mut *state, round_constants) }
+    });
+}
+
+/// Initial external rounds for two independent states sharing one constant set.
+///
+/// Each lane first receives one `mds_light_permutation_asm` call (`lane0`,
+/// then `lane1`). Afterwards `external_round_dual_asm` is invoked once per
+/// entry of `constants`, in slice order, on both lanes together.
+#[inline]
+pub fn external_initial_permute_dual<const WIDTH: usize>(
+    lane0: &mut [u64; WIDTH],
+    lane1: &mut [u64; WIDTH],
+    constants: &[[u64; WIDTH]],
+) {
+    // SAFETY: NOTE(review): the preconditions of `mds_light_permutation_asm`
+    // are declared outside this section — confirm they hold for any lane.
+    unsafe { mds_light_permutation_asm(lane0) };
+    unsafe { mds_light_permutation_asm(lane1) };
+
+    constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): as above, for `external_round_dual_asm`.
+        unsafe { external_round_dual_asm(&mut *lane0, &mut *lane1, round_constants) }
+    });
+}
+
+/// Terminal external rounds for two independent states sharing one constant set.
+///
+/// `external_round_dual_asm` is invoked once per entry of `constants`, in
+/// slice order, on both lanes together. An empty slice leaves both lanes
+/// untouched.
+#[inline]
+pub fn external_terminal_permute_dual<const WIDTH: usize>(
+    lane0: &mut [u64; WIDTH],
+    lane1: &mut [u64; WIDTH],
+    constants: &[[u64; WIDTH]],
+) {
+    constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): the preconditions of `external_round_dual_asm`
+        // are declared outside this section — confirm they hold for any lane.
+        unsafe { external_round_dual_asm(&mut *lane0, &mut *lane1, round_constants) }
+    });
+}
+
+/// Width-8, dual-lane initial external rounds using the fused dual round.
+///
+/// Each lane first receives one `mds_light_permutation_asm` call (`lane0`,
+/// then `lane1`). Afterwards `external_round_fused_dual_w8` runs once per
+/// entry of `constants`, in slice order.
+#[inline]
+pub fn external_initial_permute_dual_w8(
+    lane0: &mut [u64; 8],
+    lane1: &mut [u64; 8],
+    constants: &[[u64; 8]],
+) {
+    // SAFETY: NOTE(review): the preconditions of `mds_light_permutation_asm`
+    // are declared outside this section — confirm they hold for any lane.
+    unsafe { mds_light_permutation_asm(lane0) };
+    unsafe { mds_light_permutation_asm(lane1) };
+
+    for round_constants in constants.iter() {
+        // SAFETY: NOTE(review): `external_round_fused_dual_w8` only forwards to
+        // the scalar ASM helpers; confirm their contract is met here.
+        unsafe { external_round_fused_dual_w8(lane0, lane1, round_constants) };
+    }
+}
+
+/// Width-8, dual-lane terminal external rounds using the fused dual round.
+///
+/// `external_round_fused_dual_w8` runs once per entry of `constants`, in slice
+/// order. An empty slice leaves both lanes untouched.
+#[inline]
+pub fn external_terminal_permute_dual_w8(
+    lane0: &mut [u64; 8],
+    lane1: &mut [u64; 8],
+    constants: &[[u64; 8]],
+) {
+    constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): `external_round_fused_dual_w8` only forwards to
+        // the scalar ASM helpers; confirm their contract is met here.
+        unsafe { external_round_fused_dual_w8(&mut *lane0, &mut *lane1, round_constants) }
+    });
+}
+
+// NEON 2-wide Goldilocks field primitives.
+// Each operates on both packed lanes simultaneously using uint64x2_t.
+
+/// Lane-wise field addition of two packed `u64` pairs.
+///
+/// Each lane computes the wrapping sum `a + b`; when that sum wrapped past
+/// 2^64 (detected as `a > sum`), `0xFFFF_FFFF` is added as a correction. With
+/// `P = 2^64 - 2^32 + 1` that correction equals `2^64 - P`. The result is not
+/// reduced to `[0, P)`.
+///
+/// NOTE(review): the corrective add is itself an unchecked wrapping add. It
+/// cannot wrap when at least one operand is below `P`; presumably callers
+/// guarantee that — confirm.
+#[inline(always)]
+unsafe fn add_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let wrapped_sum = vaddq_u64(a, b);
+        // The comparison yields all-ones in lanes that wrapped; shifting it
+        // right by 32 leaves 0xFFFF_FFFF there and zero elsewhere.
+        let correction = vshrq_n_u64::<32>(vcgtq_u64(a, wrapped_sum));
+        vaddq_u64(wrapped_sum, correction)
+    }
+}
+
+/// Lane-wise field subtraction `a - b` of two packed `u64` pairs.
+///
+/// Each lane computes the wrapping difference; when it borrowed (detected as
+/// `b > a`), `0xFFFF_FFFF` is subtracted as a correction. With
+/// `P = 2^64 - 2^32 + 1` that correction equals `2^64 - P`. The result is not
+/// reduced to `[0, P)`.
+///
+/// NOTE(review): the corrective subtract is itself unchecked; it can only
+/// borrow again when `b - a` exceeds `P`. Presumably callers rule that out —
+/// confirm.
+#[inline(always)]
+unsafe fn sub_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let wrapped_diff = vsubq_u64(a, b);
+        // The comparison yields all-ones in lanes that borrowed; shifting it
+        // right by 32 leaves 0xFFFF_FFFF there and zero elsewhere.
+        let correction = vshrq_n_u64::<32>(vcgtq_u64(b, a));
+        vsubq_u64(wrapped_diff, correction)
+    }
+}
+
+/// Doubles both lanes in the field by adding the vector to itself with
+/// `add_neon`.
+#[inline(always)]
+unsafe fn double_neon(a: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let twice = add_neon(a, a);
+        twice
+    }
+}
+
+/// Halves both lanes in the field.
+///
+/// An even lane becomes `x >> 1`. An odd lane becomes
+/// `(x >> 1) + (P + 1) / 2`, which equals `(x + P) / 2`. With
+/// `P = 2^64 - 2^32 + 1` (as stated elsewhere in this file) the sum is at most
+/// `2^64 - 2^31`, so the plain wrapping add cannot overflow.
+#[inline(always)]
+unsafe fn div2_neon(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let low_bit = vandq_u64(x, vdupq_n_u64(1));
+        // `vtstq_u64` sets every bit of a lane whose AND is non-zero, turning
+        // the low bit into a full-width "is odd" mask.
+        let odd_mask = vtstq_u64(low_bit, low_bit);
+        let odd_fixup = vandq_u64(odd_mask, vdupq_n_u64((P + 1) >> 1));
+        vaddq_u64(vshrq_n_u64::<1>(x), odd_fixup)
+    }
+}
+
+/// Divides both lanes by 4 in the field: two successive `div2_neon` halvings.
+#[inline(always)]
+unsafe fn div4_neon(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let halved = div2_neon(x);
+        div2_neon(halved)
+    }
+}
+
+/// Divides both lanes by 8 in the field: `div4_neon` followed by one more
+/// `div2_neon` halving.
+#[inline(always)]
+unsafe fn div8_neon(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let quarter = div4_neon(x);
+        div2_neon(quarter)
+    }
+}
+
+/// Divides both lanes by 16 in the field: `div8_neon` followed by one more
+/// `div2_neon` halving.
+#[inline(always)]
+unsafe fn div16_neon(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let eighth = div8_neon(x);
+        div2_neon(eighth)
+    }
+}
+
+/// Divides both lanes by 32 in the field: `div8_neon` (three halvings)
+/// followed by `div4_neon` (two more).
+#[inline(always)]
+unsafe fn div32_neon(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let eighth = div8_neon(x);
+        div4_neon(eighth)
+    }
+}
+
+/// Multiplies each lane by `2^{-32}` modulo `P`, exploiting the shape of the
+/// Goldilocks prime.
+///
+/// With `P = 2^64 - 2^32 + 1`, `2^64 ≡ 2^32 - 1`, hence
+/// `2^{-32} ≡ 1 - 2^32 (mod P)`. Writing `x = x_hi * 2^32 + x_lo` gives
+/// `x * 2^{-32} ≡ x_hi + x_lo - (x_lo << 32) (mod P)`.
+///
+/// `x_hi + x_lo` is below `2^33`, so the plain add cannot wrap; the final
+/// subtraction goes through `sub_neon` to handle the borrow.
+#[inline(always)]
+unsafe fn div_2_32_neon(x: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let upper_half = vshrq_n_u64::<32>(x);
+        let lower_half = vandq_u64(x, vdupq_n_u64(0xFFFFFFFF));
+        let halves_sum = vaddq_u64(upper_half, lower_half);
+        let lower_shifted = vshlq_n_u64::<32>(lower_half);
+        sub_neon(halves_sum, lower_shifted)
+    }
+}
+
+/// Multiplies a 4-element chunk by a fixed 4x4 matrix, on both packed lanes.
+///
+/// Using only `add_neon` and `double_neon`, the outputs are:
+/// - `x[0] = 2*x0 + 3*x1 +   x2 +   x3`
+/// - `x[1] =   x0 + 2*x1 + 3*x2 +   x3`
+/// - `x[2] =   x0 +   x1 + 2*x2 + 3*x3`
+/// - `x[3] = 3*x0 +   x1 +   x2 + 2*x3`
+///
+/// All four outputs are computed from the original inputs.
+#[inline(always)]
+unsafe fn apply_mat4_neon(x: &mut [uint64x2_t; 4]) {
+    unsafe {
+        let [a, b, c, d] = *x;
+
+        let ab = add_neon(a, b);
+        let cd = add_neon(c, d);
+        let total = add_neon(ab, cd);
+        let total_plus_b = add_neon(total, b);
+        let total_plus_d = add_neon(total, d);
+
+        let out3 = add_neon(total_plus_d, double_neon(a));
+        let out1 = add_neon(total_plus_b, double_neon(c));
+        let out0 = add_neon(total_plus_b, ab);
+        let out2 = add_neon(total_plus_d, cd);
+
+        *x = [out0, out1, out2, out3];
+    }
+}
+
+/// Light MDS layer over a packed state.
+///
+/// Three passes:
+/// 1. every consecutive group of four elements goes through
+///    `apply_mat4_neon`;
+/// 2. four running sums are formed, `sums[k]` collecting the elements at
+///    positions `k`, `k + 4`, `k + 8`, ...;
+/// 3. every element `i` gets `sums[i % 4]` added to it.
+///
+/// # Panics
+///
+/// Slicing `state[base..base + 4]` panics when `WIDTH` is not a multiple of 4.
+#[inline(always)]
+unsafe fn mds_light_neon<const WIDTH: usize>(state: &mut [uint64x2_t; WIDTH]) {
+    unsafe {
+        // Pass 1: 4x4 matrix on each group of four.
+        for base in (0..WIDTH).step_by(4) {
+            let quad: &mut [uint64x2_t; 4] = (&mut state[base..base + 4]).try_into().unwrap();
+            apply_mat4_neon(quad);
+        }
+
+        // Pass 2: per-position sums across the groups.
+        let mut sums = [vdupq_n_u64(0); 4];
+        for quad in state.chunks_exact(4) {
+            for (acc, &value) in sums.iter_mut().zip(quad.iter()) {
+                *acc = add_neon(*acc, value);
+            }
+        }
+
+        // Pass 3: fold the matching sum back into each element.
+        for (elem, &column_sum) in state.iter_mut().zip(sums.iter().cycle()) {
+            *elem = add_neon(*elem, column_sum);
+        }
+    }
+}
+
+/// Packs two scalar states into one array of 2-lane NEON vectors.
+///
+/// Element `i` of the result holds `lane0[i]` in vector lane 0 and `lane1[i]`
+/// in vector lane 1.
+#[inline]
+pub fn lanes_to_neon<const WIDTH: usize>(
+    lane0: &[u64; WIDTH],
+    lane1: &[u64; WIDTH],
+) -> [uint64x2_t; WIDTH] {
+    let pack = |idx: usize| unsafe {
+        vcombine_u64(vcreate_u64(lane0[idx]), vcreate_u64(lane1[idx]))
+    };
+    core::array::from_fn(pack)
+}
+
+/// Unpacks an array of 2-lane NEON vectors into two scalar states.
+///
+/// For every index `i`, vector lane 0 of `state_v[i]` is stored in `lane0[i]`
+/// and vector lane 1 in `lane1[i]`. Both output arrays are fully overwritten.
+#[inline]
+pub fn neon_to_lanes<const WIDTH: usize>(
+    state_v: &[uint64x2_t; WIDTH],
+    lane0: &mut [u64; WIDTH],
+    lane1: &mut [u64; WIDTH],
+) {
+    let outputs = lane0.iter_mut().zip(lane1.iter_mut());
+    for (packed, (out0, out1)) in state_v.iter().zip(outputs) {
+        unsafe {
+            *out0 = vgetq_lane_u64::<0>(*packed);
+            *out1 = vgetq_lane_u64::<1>(*packed);
+        }
+    }
+}
+
+// NEON-based internal permutation: both packed lanes processed
+// simultaneously via uint64x2_t for sum tree, diagonal, and writeback.
+
+/// Internal rounds for width 12, processing two packed states at once.
+///
+/// One round is executed per entry of `constants`. In each round:
+/// - `rc` is added to element 0, and that element is raised to the 7th power
+///   lane by lane with the scalar `mul_asm`;
+/// - `sum` is the field sum of the new element 0 and elements 1..=11;
+/// - element `i` becomes `sum + c_i * state[i]`, with the multipliers
+///   hard-coded as
+///   `c = [-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/4, -1/4, 1/8]`
+///   (element 0 is written as `sum_hi - s0^7`, i.e. `sum - 2 * s0^7`).
+///
+/// Element 0 is carried in a local across rounds and stored back at the end.
+///
+/// NOTE(review): the multipliers are presumably the width-12 internal diagonal
+/// used by the scalar path — confirm they match `MATRIX_DIAG_12_GOLDILOCKS`.
+#[inline]
+pub fn internal_permute_neon_w12(state: &mut [uint64x2_t; 12], constants: &[u64]) {
+    let mut s0 = state[0];
+    for &rc in constants {
+        unsafe {
+            // The same round constant is broadcast to both lanes.
+            let rc_vec = vdupq_n_u64(rc);
+            s0 = add_neon(s0, rc_vec);
+
+            // S-box on element 0: the lanes are extracted and multiplied with
+            // the scalar `mul_asm`. The NEON sums below are placed between the
+            // S-box steps. NOTE(review): presumably to overlap the scalar
+            // multiplies with vector work — confirm before reordering.
+            let s0_0 = vgetq_lane_u64::<0>(s0);
+            let s0_1 = vgetq_lane_u64::<1>(s0);
+            let s0_2_0 = mul_asm(s0_0, s0_0);
+            let s0_2_1 = mul_asm(s0_1, s0_1);
+
+            // Pairwise sums of elements 1..=10 (first level of the sum tree).
+            let sum1 = add_neon(state[1], state[2]);
+            let sum2 = add_neon(state[3], state[4]);
+            let sum3 = add_neon(state[5], state[6]);
+            let sum4 = add_neon(state[7], state[8]);
+            let sum5 = add_neon(state[9], state[10]);
+
+            let s0_3_0 = mul_asm(s0_2_0, s0_0);
+            let s0_3_1 = mul_asm(s0_2_1, s0_1);
+            let s0_4_0 = mul_asm(s0_2_0, s0_2_0);
+            let s0_4_1 = mul_asm(s0_2_1, s0_2_1);
+
+            let sum12 = add_neon(sum1, sum2);
+            let sum34 = add_neon(sum3, sum4);
+            let sum511 = add_neon(sum5, state[11]);
+
+            // Diagonal terms |c_i| * state[i]; the sign is applied at
+            // write-back by choosing `add_neon` or `sub_neon`.
+            let d1 = state[1];
+            let d2 = double_neon(state[2]);
+            let d3 = div2_neon(state[3]);
+            let d4 = add_neon(double_neon(state[4]), state[4]);
+
+            let sum1234 = add_neon(sum12, sum34);
+
+            let d5 = double_neon(double_neon(state[5]));
+            let d6 = div2_neon(state[6]);
+            let d7 = add_neon(double_neon(state[7]), state[7]);
+            let d8 = double_neon(double_neon(state[8]));
+
+            // Sum of elements 1..=11 (everything except element 0).
+            let sum_hi = add_neon(sum1234, sum511);
+
+            let d9 = div4_neon(state[9]);
+            let d10 = div4_neon(state[10]);
+            let d11 = div8_neon(state[11]);
+
+            // s0^7 = s0^3 * s0^4, repacked into a vector.
+            let s0_7_0 = mul_asm(s0_3_0, s0_4_0);
+            let s0_7_1 = mul_asm(s0_3_1, s0_4_1);
+            let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1));
+
+            // Full sum, and the new element 0: sum_hi - s0^7 = sum - 2 * s0^7.
+            let sum = add_neon(sum_hi, s0_7);
+            s0 = sub_neon(sum_hi, s0_7);
+
+            state[1] = add_neon(d1, sum);
+            state[2] = add_neon(d2, sum);
+            state[3] = add_neon(d3, sum);
+            state[4] = add_neon(d4, sum);
+            state[5] = add_neon(d5, sum);
+            state[6] = sub_neon(sum, d6);
+            state[7] = sub_neon(sum, d7);
+            state[8] = sub_neon(sum, d8);
+            state[9] = add_neon(d9, sum);
+            state[10] = sub_neon(sum, d10);
+            state[11] = add_neon(d11, sum);
+        }
+    }
+    // Element 0 lived in a local for the whole loop; store it back once.
+    state[0] = s0;
+}
+
+/// Internal rounds for width 16, processing two packed states at once.
+///
+/// One round is executed per entry of `constants`. In each round:
+/// - `rc` is added to element 0, and that element is raised to the 7th power
+///   lane by lane with the scalar `mul_asm`;
+/// - `sum` is the field sum of the new element 0 and elements 1..=15;
+/// - element `i` becomes `sum + c_i * state[i]`, with the multipliers
+///   hard-coded as
+///   `c = [-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/8, 1/16, 1/32, -1/8, -1/16,
+///   -1/32, 1/2^32]`
+///   (element 0 is written as `sum_hi - s0^7`, i.e. `sum - 2 * s0^7`).
+///
+/// Element 0 is carried in a local across rounds and stored back at the end.
+///
+/// NOTE(review): the multipliers are presumably the width-16 internal diagonal
+/// used by the scalar path — confirm they match `MATRIX_DIAG_16_GOLDILOCKS`.
+#[inline]
+pub fn internal_permute_neon_w16(state: &mut [uint64x2_t; 16], constants: &[u64]) {
+    let mut s0 = state[0];
+    for &rc in constants {
+        unsafe {
+            // The same round constant is broadcast to both lanes.
+            let rc_vec = vdupq_n_u64(rc);
+            s0 = add_neon(s0, rc_vec);
+
+            // S-box on element 0: the lanes are extracted and multiplied with
+            // the scalar `mul_asm`. The NEON sums below are placed between the
+            // S-box steps. NOTE(review): presumably to overlap the scalar
+            // multiplies with vector work — confirm before reordering.
+            let s0_0 = vgetq_lane_u64::<0>(s0);
+            let s0_1 = vgetq_lane_u64::<1>(s0);
+            let s0_2_0 = mul_asm(s0_0, s0_0);
+            let s0_2_1 = mul_asm(s0_1, s0_1);
+
+            // Pairwise sums of elements 1..=14 (first level of the sum tree).
+            let sum1 = add_neon(state[1], state[2]);
+            let sum2 = add_neon(state[3], state[4]);
+            let sum3 = add_neon(state[5], state[6]);
+            let sum4 = add_neon(state[7], state[8]);
+            let sum5 = add_neon(state[9], state[10]);
+            let sum6 = add_neon(state[11], state[12]);
+            let sum7 = add_neon(state[13], state[14]);
+
+            let s0_3_0 = mul_asm(s0_2_0, s0_0);
+            let s0_3_1 = mul_asm(s0_2_1, s0_1);
+            let s0_4_0 = mul_asm(s0_2_0, s0_2_0);
+            let s0_4_1 = mul_asm(s0_2_1, s0_2_1);
+
+            let sum12 = add_neon(sum1, sum2);
+            let sum34 = add_neon(sum3, sum4);
+            let sum56 = add_neon(sum5, sum6);
+            let sum715 = add_neon(sum7, state[15]);
+
+            // Sum of elements 1..=15 (everything except element 0).
+            let sum1234 = add_neon(sum12, sum34);
+            let sum56715 = add_neon(sum56, sum715);
+            let sum_hi = add_neon(sum1234, sum56715);
+
+            // Diagonal terms |c_i| * state[i]; the sign is applied at
+            // write-back by choosing `add_neon` or `sub_neon`.
+            let d1 = state[1];
+            let d2 = double_neon(state[2]);
+            let d3 = div2_neon(state[3]);
+            let d4 = add_neon(double_neon(state[4]), state[4]);
+            let d5 = double_neon(double_neon(state[5]));
+            let d6 = div2_neon(state[6]);
+            let d7 = add_neon(double_neon(state[7]), state[7]);
+            let d8 = double_neon(double_neon(state[8]));
+
+            let d9 = div8_neon(state[9]);
+            let d10 = div16_neon(state[10]);
+            let d11 = div32_neon(state[11]);
+            let d12 = div8_neon(state[12]);
+            let d13 = div16_neon(state[13]);
+            let d14 = div32_neon(state[14]);
+            let d15 = div_2_32_neon(state[15]);
+
+            // s0^7 = s0^3 * s0^4, repacked into a vector.
+            let s0_7_0 = mul_asm(s0_3_0, s0_4_0);
+            let s0_7_1 = mul_asm(s0_3_1, s0_4_1);
+            let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1));
+
+            // Full sum, and the new element 0: sum_hi - s0^7 = sum - 2 * s0^7.
+            let sum = add_neon(sum_hi, s0_7);
+            s0 = sub_neon(sum_hi, s0_7);
+
+            state[1] = add_neon(d1, sum);
+            state[2] = add_neon(d2, sum);
+            state[3] = add_neon(d3, sum);
+            state[4] = add_neon(d4, sum);
+            state[5] = add_neon(d5, sum);
+            state[6] = sub_neon(sum, d6);
+            state[7] = sub_neon(sum, d7);
+            state[8] = sub_neon(sum, d8);
+            state[9] = add_neon(d9, sum);
+            state[10] = add_neon(d10, sum);
+            state[11] = add_neon(d11, sum);
+            state[12] = sub_neon(sum, d12);
+            state[13] = sub_neon(sum, d13);
+            state[14] = sub_neon(sum, d14);
+            state[15] = add_neon(d15, sum);
+        }
+    }
+    // Element 0 lived in a local for the whole loop; store it back once.
+    state[0] = s0;
+}
+
+/// Generic-width internal rounds over two packed states, with an explicit
+/// diagonal.
+///
+/// One round is executed per entry of `constants`. In each round:
+/// - `rc` is added to element 0, which is then raised to the 7th power lane by
+///   lane with the scalar `mul_asm`;
+/// - `total` is the field sum of that 7th power and elements `1..WIDTH`;
+/// - every element `i` is replaced by
+///   `mul_add_asm(value_i, diag[i], total)`, where `value_0` is the 7th power.
+///   NOTE(review): `mul_add_asm` is defined outside this section; presumably
+///   it computes `a * b + c` in the field — confirm.
+///
+/// Element 0 is carried in a local across rounds and stored back at the end.
+///
+/// # Panics
+///
+/// Indexing `state[0]` panics when `WIDTH` is 0.
+#[inline]
+pub fn internal_permute_neon<const WIDTH: usize>(
+    state: &mut [uint64x2_t; WIDTH],
+    diag: &[u64; WIDTH],
+    constants: &[u64],
+) {
+    let mut head = state[0];
+    for &rc in constants {
+        unsafe {
+            head = add_neon(head, vdupq_n_u64(rc));
+
+            // Scalar S-box on both lanes of element 0: x^7 = x^3 * x^4.
+            let x_a = vgetq_lane_u64::<0>(head);
+            let x_b = vgetq_lane_u64::<1>(head);
+            let sq_a = mul_asm(x_a, x_a);
+            let sq_b = mul_asm(x_b, x_b);
+            let cube_a = mul_asm(sq_a, x_a);
+            let cube_b = mul_asm(sq_b, x_b);
+            let quad_a = mul_asm(sq_a, sq_a);
+            let quad_b = mul_asm(sq_b, sq_b);
+            let pow7_a = mul_asm(cube_a, quad_a);
+            let pow7_b = mul_asm(cube_b, quad_b);
+            let pow7 = vcombine_u64(vcreate_u64(pow7_a), vcreate_u64(pow7_b));
+
+            // Sum of every element except element 0.
+            let mut tail_sum = vdupq_n_u64(0);
+            for &elem in &state[1..] {
+                tail_sum = add_neon(tail_sum, elem);
+            }
+
+            let total = add_neon(tail_sum, pow7);
+            let total_a = vgetq_lane_u64::<0>(total);
+            let total_b = vgetq_lane_u64::<1>(total);
+
+            head = vcombine_u64(
+                vcreate_u64(mul_add_asm(pow7_a, diag[0], total_a)),
+                vcreate_u64(mul_add_asm(pow7_b, diag[0], total_b)),
+            );
+
+            // Remaining elements: the diagonal multiply-add runs per lane on
+            // scalars, and the result is repacked.
+            for (elem, &factor) in state.iter_mut().zip(diag.iter()).skip(1) {
+                let new_a = mul_add_asm(vgetq_lane_u64::<0>(*elem), factor, total_a);
+                let new_b = mul_add_asm(vgetq_lane_u64::<1>(*elem), factor, total_b);
+                *elem = vcombine_u64(vcreate_u64(new_a), vcreate_u64(new_b));
+            }
+        }
+    }
+    state[0] = head;
+}
+
+// NEON-based external round: S-box stays scalar, MDS uses NEON.
+
+/// Raises every element of a packed state to the 7th power, in place.
+///
+/// The multiplications run on scalars: both lanes of each vector are
+/// extracted and fed to `mul_asm`, then repacked. The work is staged in three
+/// passes over the whole state — squares, then cubes and fourth powers, then
+/// `x^7 = x^3 * x^4`.
+#[inline(always)]
+unsafe fn sbox_neon<const WIDTH: usize>(state: &mut [uint64x2_t; WIDTH]) {
+    unsafe {
+        // Pass 1: x^2 for both lanes of every element.
+        let mut squares = [(0u64, 0u64); WIDTH];
+        for (slot, packed) in squares.iter_mut().zip(state.iter()) {
+            let lane_a = vgetq_lane_u64::<0>(*packed);
+            let lane_b = vgetq_lane_u64::<1>(*packed);
+            *slot = (mul_asm(lane_a, lane_a), mul_asm(lane_b, lane_b));
+        }
+
+        // Pass 2: x^3 = x^2 * x and x^4 = x^2 * x^2.
+        let mut cubes = [(0u64, 0u64); WIDTH];
+        let mut fourths = [(0u64, 0u64); WIDTH];
+        for idx in 0..WIDTH {
+            let lane_a = vgetq_lane_u64::<0>(state[idx]);
+            let lane_b = vgetq_lane_u64::<1>(state[idx]);
+            let (sq_a, sq_b) = squares[idx];
+            cubes[idx] = (mul_asm(sq_a, lane_a), mul_asm(sq_b, lane_b));
+            fourths[idx] = (mul_asm(sq_a, sq_a), mul_asm(sq_b, sq_b));
+        }
+
+        // Pass 3: x^7 = x^3 * x^4, repacked into the state.
+        let powers = cubes.iter().zip(fourths.iter());
+        for (packed, (&(cube_a, cube_b), &(fourth_a, fourth_b))) in state.iter_mut().zip(powers) {
+            let out_a = mul_asm(cube_a, fourth_a);
+            let out_b = mul_asm(cube_b, fourth_b);
+            *packed = vcombine_u64(vcreate_u64(out_a), vcreate_u64(out_b));
+        }
+    }
+}
+
+/// One external round over a packed state.
+///
+/// Adds `rc[i]` (broadcast to both lanes) to element `i` with `add_neon`,
+/// applies `sbox_neon` to the whole state, then `mds_light_neon`.
+#[inline(always)]
+unsafe fn external_round_neon<const WIDTH: usize>(
+    state: &mut [uint64x2_t; WIDTH],
+    rc: &[u64; WIDTH],
+) {
+    unsafe {
+        for (elem, &constant) in state.iter_mut().zip(rc.iter()) {
+            *elem = add_neon(*elem, vdupq_n_u64(constant));
+        }
+        sbox_neon(state);
+        mds_light_neon(state);
+    }
+}
+
+/// Initial external rounds over a packed (two-state) NEON representation.
+///
+/// Applies `mds_light_neon` once, then `external_round_neon` once per entry of
+/// `constants`, in slice order. An empty slice still applies the leading MDS.
+#[inline]
+pub fn external_initial_neon<const WIDTH: usize>(
+    state: &mut [uint64x2_t; WIDTH],
+    constants: &[[u64; WIDTH]],
+) {
+    // SAFETY: NOTE(review): `mds_light_neon` is an `unsafe fn` without a
+    // documented contract in this section — confirm what it requires.
+    unsafe { mds_light_neon(state) };
+
+    constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): as above, for `external_round_neon`.
+        unsafe { external_round_neon(&mut *state, round_constants) }
+    });
+}
+
+/// Terminal external rounds over a packed (two-state) NEON representation.
+///
+/// Applies `external_round_neon` once per entry of `constants`, in slice
+/// order. An empty slice leaves `state` untouched.
+#[inline]
+pub fn external_terminal_neon<const WIDTH: usize>(
+    state: &mut [uint64x2_t; WIDTH],
+    constants: &[[u64; WIDTH]],
+) {
+    constants.iter().for_each(|round_constants| {
+        // SAFETY: NOTE(review): `external_round_neon` is an `unsafe fn` without
+        // a documented contract in this section — confirm what it requires.
+        unsafe { external_round_neon(&mut *state, round_constants) }
+    });
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::vec::Vec;
+
+    use p3_field::{PrimeCharacteristicRing, PrimeField64};
+    use p3_poseidon2::{MDSMat4, matmul_internal, mds_light_permutation};
+    use proptest::prelude::*;
+    use rand::rngs::SmallRng;
+    use rand::{RngExt, SeedableRng};
+
+    use super::*;
+    use crate::{
+        Goldilocks, MATRIX_DIAG_8_GOLDILOCKS, MATRIX_DIAG_12_GOLDILOCKS, MATRIX_DIAG_16_GOLDILOCKS,
+        MATRIX_DIAG_20_GOLDILOCKS,
+    };
+
+    type F = Goldilocks;
+
+    /// Maps a raw `u64` to the canonical representative of the field element
+    /// it denotes, by going through `F::new` and `as_canonical_u64`.
+    fn canon(x: u64) -> u64 {
+        let element = F::new(x);
+        element.as_canonical_u64()
+    }
+
+    /// Builds a NEON vector with `a` in lane 0 and `b` in lane 1.
+    unsafe fn make_neon(a: u64, b: u64) -> uint64x2_t {
+        unsafe {
+            let low = vcreate_u64(a);
+            let high = vcreate_u64(b);
+            vcombine_u64(low, high)
+        }
+    }
+
+    /// Returns `(lane 0, lane 1)` of a NEON vector as scalars.
+    unsafe fn read_neon(v: uint64x2_t) -> (u64, u64) {
+        unsafe {
+            let low = vgetq_lane_u64::<0>(v);
+            let high = vgetq_lane_u64::<1>(v);
+            (low, high)
+        }
+    }
+
+    // Property tests for the scalar ASM primitives and rounds. Inputs are
+    // arbitrary `u64` values; outputs are compared after reduction with
+    // `canon`, so the ASM routines may return non-canonical representatives.
+    proptest! {
+        #[test]
+        fn test_sub_asm(a: u64, b: u64) {
+            // Compute a - b using the standard field implementation.
+            let expected = (F::new(a) - F::new(b)).as_canonical_u64();
+
+            // The ASM version should give the same canonical result.
+            let got = canon(unsafe { sub_asm(a, b) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_double_asm(a: u64) {
+            // Doubling is just a + a in the field.
+            let expected = (F::new(a) + F::new(a)).as_canonical_u64();
+
+            // The ASM shortcut should match.
+            let got = canon(unsafe { double_asm(a) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_div2_asm(x: u64) {
+            // Dividing by 2 is one halving in the field.
+            let expected = F::new(x).halve().as_canonical_u64();
+
+            let got = canon(unsafe { div2_asm(x) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_div4_asm(x: u64) {
+            // Dividing by 4 is two halvings.
+            let expected = F::new(x).halve().halve().as_canonical_u64();
+
+            let got = canon(unsafe { div4_asm(x) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_div8_asm(x: u64) {
+            // Dividing by 8 is three halvings.
+            let expected = F::new(x).halve().halve().halve().as_canonical_u64();
+
+            let got = canon(unsafe { div8_asm(x) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_div16_asm(x: u64) {
+            // Dividing by 16 is four halvings.
+            let expected = F::new(x).halve().halve().halve().halve().as_canonical_u64();
+
+            let got = canon(unsafe { div16_asm(x) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_div32_asm(x: u64) {
+            // Dividing by 32 is five halvings.
+            let expected = F::new(x)
+                .halve().halve().halve().halve().halve()
+                .as_canonical_u64();
+
+            let got = canon(unsafe { div32_asm(x) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_div_2_32_asm(x: u64) {
+            // Dividing by 2^32: apply halve 32 times as reference.
+            let mut v = F::new(x);
+            for _ in 0..32 {
+                v = v.halve();
+            }
+            let expected = v.as_canonical_u64();
+
+            let got = canon(unsafe { div_2_32_asm(x) });
+            prop_assert_eq!(got, expected);
+        }
+
+        #[test]
+        fn test_apply_mat4_asm(x0: u64, x1: u64, x2: u64, x3: u64) {
+            // Build field elements from the raw inputs.
+            let f = [F::new(x0), F::new(x1), F::new(x2), F::new(x3)];
+
+            // The [2,3,1,1] circulant matrix rows.
+            let two = F::TWO;
+            let three = two + F::ONE;
+            let e0 = two * f[0] + three * f[1] + f[2] + f[3];
+            let e1 = f[0] + two * f[1] + three * f[2] + f[3];
+            let e2 = f[0] + f[1] + two * f[2] + three * f[3];
+            let e3 = three * f[0] + f[1] + f[2] + two * f[3];
+
+            // Run the ASM version on raw u64s.
+            let mut state = [x0, x1, x2, x3];
+            unsafe { apply_mat4_asm(&mut state); }
+
+            // Each slot must match the field-level reference.
+            prop_assert_eq!(canon(state[0]), e0.as_canonical_u64());
+            prop_assert_eq!(canon(state[1]), e1.as_canonical_u64());
+            prop_assert_eq!(canon(state[2]), e2.as_canonical_u64());
+            prop_assert_eq!(canon(state[3]), e3.as_canonical_u64());
+        }
+
+        #[test]
+        fn test_mds_light_permutation_asm_w8(vals in prop::array::uniform8(any::<u64>())) {
+            // Build field-level state and apply the generic MDS.
+            let mut state_generic: [F; 8] = vals.map(F::new);
+            mds_light_permutation(&mut state_generic, &MDSMat4);
+
+            // Run the ASM version on the same raw values.
+            let mut state_asm = vals;
+            unsafe { mds_light_permutation_asm(&mut state_asm); }
+
+            // Every element must agree.
+            for i in 0..8 {
+                prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64());
+            }
+        }
+
+        #[test]
+        fn test_mds_light_permutation_asm_w12(vals in prop::array::uniform12(any::<u64>())) {
+            // Same check as the width-8 test, on a 12-element state:
+            // generic field-level MDS as reference...
+            let mut state_generic: [F; 12] = vals.map(F::new);
+            mds_light_permutation(&mut state_generic, &MDSMat4);
+
+            // ...against the ASM version on the same raw values.
+            let mut state_asm = vals;
+            unsafe { mds_light_permutation_asm(&mut state_asm); }
+
+            for i in 0..12 {
+                prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64());
+            }
+        }
+
+        #[test]
+        fn test_mds_light_permutation_asm_w16(vals in prop::array::uniform16(any::<u64>())) {
+            // Same check as the width-8 test, on a 16-element state:
+            // generic field-level MDS as reference...
+            let mut state_generic: [F; 16] = vals.map(F::new);
+            mds_light_permutation(&mut state_generic, &MDSMat4);
+
+            // ...against the ASM version on the same raw values.
+            let mut state_asm = vals;
+            unsafe { mds_light_permutation_asm(&mut state_asm); }
+
+            for i in 0..16 {
+                prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64());
+            }
+        }
+
+        #[test]
+        fn test_sbox_layer_asm(vals in prop::array::uniform8(any::<u64>())) {
+            // Apply the ASM S-box to a copy of the input.
+            let mut state = vals;
+            unsafe { sbox_layer_asm(&mut state); }
+
+            // Verify each element is x^7 = x^3 * x^4.
+            for i in 0..8 {
+                let x = F::new(vals[i]);
+                let x2 = x * x;
+                let x3 = x2 * x;
+                let x4 = x2 * x2;
+                let x7 = x3 * x4;
+                prop_assert_eq!(canon(state[i]), x7.as_canonical_u64());
+            }
+        }
+
+        #[test]
+        fn test_external_round_asm(
+            vals in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Build reference: add round constants, apply x^7, then MDS.
+            let mut expected: [F; 8] = core::array::from_fn(|i| F::new(vals[i]) + F::new(rc[i]));
+            for x in expected.iter_mut() {
+                let x2 = *x * *x;
+                let x3 = x2 * *x;
+                let x4 = x2 * x2;
+                *x = x3 * x4;
+            }
+            mds_light_permutation(&mut expected, &MDSMat4);
+
+            // Run the ASM external round.
+            let mut state = vals;
+            unsafe { external_round_asm(&mut state, &rc); }
+
+            for i in 0..8 {
+                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
+            }
+        }
+
+        #[test]
+        fn test_sbox_layer_dual_asm(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Run sbox on each lane independently as reference.
+            let mut ref0 = vals0;
+            let mut ref1 = vals1;
+            unsafe {
+                sbox_layer_asm(&mut ref0);
+                sbox_layer_asm(&mut ref1);
+            }
+
+            // The dual-lane version processes both at once.
+            let mut s0 = vals0;
+            let mut s1 = vals1;
+            unsafe { sbox_layer_dual_asm(&mut s0, &mut s1); }
+
+            // Both lanes must match their single-lane reference.
+            for i in 0..8 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
+            }
+        }
+
+        #[test]
+        fn test_external_round_dual_asm(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Run external round on each lane independently as reference.
+            let mut ref0 = vals0;
+            let mut ref1 = vals1;
+            unsafe {
+                external_round_asm(&mut ref0, &rc);
+                external_round_asm(&mut ref1, &rc);
+            }
+
+            // The dual-lane version processes both at once.
+            let mut s0 = vals0;
+            let mut s1 = vals1;
+            unsafe { external_round_dual_asm(&mut s0, &mut s1, &rc); }
+
+            // Both lanes must match their single-lane reference.
+            for i in 0..8 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
+            }
+        }
+
+        #[test]
+        fn test_external_round_fused_w8(
+            vals in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            // The generic external round is the reference.
+            let mut ref_state = vals;
+            unsafe { external_round_asm(&mut ref_state, &rc); }
+
+            // The fused W8 version should produce the same output.
+            let mut fused_state = vals;
+            unsafe { external_round_fused_w8(&mut fused_state, &rc); }
+
+            for i in 0..8 {
+                prop_assert_eq!(canon(fused_state[i]), canon(ref_state[i]));
+            }
+        }
+
+        #[test]
+        fn test_external_round_fused_dual_w8(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Run the fused round on each lane independently as reference.
+            let mut ref0 = vals0;
+            let mut ref1 = vals1;
+            unsafe {
+                external_round_fused_w8(&mut ref0, &rc);
+                external_round_fused_w8(&mut ref1, &rc);
+            }
+
+            // The dual version processes both at once.
+            let mut s0 = vals0;
+            let mut s1 = vals1;
+            unsafe { external_round_fused_dual_w8(&mut s0, &mut s1, &rc); }
+
+            // Both lanes must match their single-lane reference.
+            for i in 0..8 {
+                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
+                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
+            }
+        }
+    }
+
+    /// Checks the ASM internal rounds against a field-level reference for one `WIDTH`.
+    fn test_internal_round_matches<const WIDTH: usize>(diag: [F; WIDTH]) {
+        let mut rng = SmallRng::seed_from_u64(12345);
+
+        // Draw order is fixed: the state first, then the 22 round constants.
+        let mut state_asm: [F; WIDTH] = rng.random();
+        let mut state_generic = state_asm;
+        let internal_constants: [F; 22] = rng.random();
+
+        // Raw `value` limbs of the constants and the diagonal for the ASM entry point.
+        let constants_raw: Vec<u64> = internal_constants.iter().map(|rc| rc.value).collect();
+        let diag_raw: [u64; WIDTH] = core::array::from_fn(|lane| diag[lane].value);
+
+        // NOTE(review): the cast assumes `F` has the same layout as `u64` — confirm.
+        let state_raw: &mut [u64; WIDTH] =
+            unsafe { &mut *(&mut state_asm as *mut [F; WIDTH] as *mut [u64; WIDTH]) };
+        internal_permute_state_asm(state_raw, &diag_raw, &constants_raw);
+
+        // Reference path, per round: add the constant, raise element 0 to the 7th, matmul.
+        for &rc in internal_constants.iter() {
+            state_generic[0] += rc;
+            let x = state_generic[0];
+            let x2 = x * x;
+            state_generic[0] = (x2 * x) * (x2 * x2);
+            matmul_internal(&mut state_generic, diag);
+        }
+
+        for (i, (asm, generic)) in state_asm.iter().zip(&state_generic).enumerate() {
+            assert_eq!(
+                asm.as_canonical_u64(),
+                generic.as_canonical_u64(),
+                "mismatch at index {i}"
+            );
+        }
+    }
+
+    #[test] // ASM internal rounds vs. field-level reference (width-8 diagonal).
+    fn test_internal_round_width_8() {
+        test_internal_round_matches(MATRIX_DIAG_8_GOLDILOCKS);
+    }
+
+    #[test] // ASM internal rounds vs. field-level reference (width-12 diagonal).
+    fn test_internal_round_width_12() {
+        test_internal_round_matches(MATRIX_DIAG_12_GOLDILOCKS);
+    }
+
+    #[test] // ASM internal rounds vs. field-level reference (width-16 diagonal).
+    fn test_internal_round_width_16() {
+        test_internal_round_matches(MATRIX_DIAG_16_GOLDILOCKS);
+    }
+
+    #[test] // ASM internal rounds vs. field-level reference (width-20 diagonal).
+    fn test_internal_round_width_20() {
+        test_internal_round_matches(MATRIX_DIAG_20_GOLDILOCKS);
+    }
+
+    /// Checks a width-specialized internal permute against the generic ASM one.
+    fn test_specialized_matches_generic<const WIDTH: usize>(
+        diag: [F; WIDTH],
+        specialized_fn: fn(&mut [u64; WIDTH], &[u64]),
+    ) {
+        let mut rng = SmallRng::seed_from_u64(42);
+        let internal_constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
+        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
+
+        // Same constants for every trial; each trial draws a fresh random state.
+        for trial in 0..8 {
+            let mut state_specialized: [u64; WIDTH] = rng.random();
+            let mut state_generic = state_specialized;
+
+            specialized_fn(&mut state_specialized, &internal_constants);
+            internal_permute_state_asm(&mut state_generic, &diag_raw, &internal_constants);
+            // Report the trial and index on failure so a mismatch can be located.
+            for (i, (s, g)) in state_specialized.iter().zip(&state_generic).enumerate() {
+                assert_eq!(canon(*s), canon(*g), "trial {trial}: mismatch at index {i}");
+            }
+        }
+    }
+
+    #[test] // internal_permute_state_asm_w8 vs. the generic internal_permute_state_asm.
+    fn test_specialized_w8_matches_generic() {
+        test_specialized_matches_generic(MATRIX_DIAG_8_GOLDILOCKS, internal_permute_state_asm_w8);
+    }
+
+    #[test] // internal_permute_state_asm_w12 vs. the generic internal_permute_state_asm.
+    fn test_specialized_w12_matches_generic() {
+        test_specialized_matches_generic(MATRIX_DIAG_12_GOLDILOCKS, internal_permute_state_asm_w12);
+    }
+
+    #[test] // internal_permute_state_asm_w16 vs. the generic internal_permute_state_asm.
+    fn test_specialized_w16_matches_generic() {
+        test_specialized_matches_generic(MATRIX_DIAG_16_GOLDILOCKS, internal_permute_state_asm_w16);
+    }
+
+    /// Checks that `dual_fn` on two lanes equals `single_fn` run on each lane alone.
+    #[allow(clippy::type_complexity)]
+    fn test_dual_matches_single<const WIDTH: usize>(
+        diag: [F; WIDTH],
+        single_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]),
+        dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64; WIDTH], &[u64]),
+    ) {
+        let mut rng = SmallRng::seed_from_u64(77);
+        let diag_raw: [u64; WIDTH] = core::array::from_fn(|lane| diag[lane].value);
+        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
+
+        // Two random input lanes, drawn after the constants.
+        let mut lane0: [u64; WIDTH] = rng.random();
+        let mut lane1: [u64; WIDTH] = rng.random();
+
+        // Expected: the single-lane function on a copy of each lane.
+        let (mut want0, mut want1) = (lane0, lane1);
+        single_fn(&mut want0, &diag_raw, &constants);
+        single_fn(&mut want1, &diag_raw, &constants);
+
+        // Actual: the dual-lane function mutates both lanes in one call.
+        dual_fn(&mut lane0, &mut lane1, &diag_raw, &constants);
+
+        for i in 0..WIDTH {
+            assert_eq!(canon(lane0[i]), canon(want0[i]), "lane0 mismatch at {i}");
+            assert_eq!(canon(lane1[i]), canon(want1[i]), "lane1 mismatch at {i}");
+        }
+    }
+
+    #[test] // Dual-lane split permute vs. per-lane scalar permute (width-8 diagonal).
+    fn test_internal_permute_split_dual_w8() {
+        test_dual_matches_single(
+            MATRIX_DIAG_8_GOLDILOCKS,
+            internal_permute_state_asm,
+            internal_permute_split_dual,
+        );
+    }
+
+    #[test] // Dual-lane split permute vs. per-lane scalar permute (width-12 diagonal).
+    fn test_internal_permute_split_dual_w12() {
+        test_dual_matches_single(
+            MATRIX_DIAG_12_GOLDILOCKS,
+            internal_permute_state_asm,
+            internal_permute_split_dual,
+        );
+    }
+
+    #[test] // Dual-lane split permute vs. per-lane scalar permute (width-16 diagonal).
+    fn test_internal_permute_split_dual_w16() {
+        test_dual_matches_single(
+            MATRIX_DIAG_16_GOLDILOCKS,
+            internal_permute_state_asm,
+            internal_permute_split_dual,
+        );
+    }
+
+    /// Checks a width-specialized dual-lane permute against `internal_permute_split_dual`.
+    fn test_specialized_dual_matches_generic_dual<const WIDTH: usize>(
+        diag: [F; WIDTH],
+        specialized_dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64]),
+    ) {
+        let mut rng = SmallRng::seed_from_u64(99);
+        let diag_raw: [u64; WIDTH] = core::array::from_fn(|lane| diag[lane].value);
+        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
+
+        // Two random input lanes, drawn after the constants.
+        let mut lane0: [u64; WIDTH] = rng.random();
+        let mut lane1: [u64; WIDTH] = rng.random();
+
+        // Expected: the generic dual-lane permute, which takes the diagonal explicitly.
+        let (mut want0, mut want1) = (lane0, lane1);
+        internal_permute_split_dual(&mut want0, &mut want1, &diag_raw, &constants);
+
+        // Actual: the specialized function, which is not handed the diagonal.
+        specialized_dual_fn(&mut lane0, &mut lane1, &constants);
+
+        for i in 0..WIDTH {
+            assert_eq!(canon(lane0[i]), canon(want0[i]), "lane0 mismatch at {i}");
+            assert_eq!(canon(lane1[i]), canon(want1[i]), "lane1 mismatch at {i}");
+        }
+    }
+
+    #[test] // internal_permute_split_dual_w8 vs. the generic internal_permute_split_dual.
+    fn test_specialized_dual_w8_matches_generic() {
+        test_specialized_dual_matches_generic_dual(
+            MATRIX_DIAG_8_GOLDILOCKS,
+            internal_permute_split_dual_w8,
+        );
+    }
+
+    #[test] // internal_permute_split_dual_w12 vs. the generic internal_permute_split_dual.
+    fn test_specialized_dual_w12_matches_generic() {
+        test_specialized_dual_matches_generic_dual(
+            MATRIX_DIAG_12_GOLDILOCKS,
+            internal_permute_split_dual_w12,
+        );
+    }
+
+    #[test] // internal_permute_split_dual_w16 vs. the generic internal_permute_split_dual.
+    fn test_specialized_dual_w16_matches_generic() {
+        test_specialized_dual_matches_generic_dual(
+            MATRIX_DIAG_16_GOLDILOCKS,
+            internal_permute_split_dual_w16,
+        );
+    }
+
+    fn make_round_constants<const WIDTH: usize>(seed: u64, num_rounds: usize) -> Vec<[u64; WIDTH]> {
+        let mut rng = SmallRng::seed_from_u64(seed); // one seeded RNG feeds every round
+        core::iter::repeat_with(|| rng.random()).take(num_rounds).collect()
+    }
+
+    proptest! {
+        #[test]
+        fn test_external_initial_permute_state_asm(
+            vals in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(42, 4);
+
+            // Hand-built reference: one MDS pass, then one external round per constant set.
+            let mut want = vals;
+            unsafe { mds_light_permutation_asm(&mut want); }
+            for rc in &round_constants {
+                unsafe { external_round_asm(&mut want, rc); }
+            }
+
+            // The function under test performs the whole sequence in a single call.
+            let mut have = vals;
+            external_initial_permute_state_asm(&mut have, &round_constants);
+
+            for (actual, expected) in have.iter().zip(&want) {
+                prop_assert_eq!(canon(*actual), canon(*expected));
+            }
+        }
+
+        #[test]
+        fn test_external_terminal_permute_state_asm(
+            vals in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(43, 4);
+
+            // Hand-built reference: the external rounds alone, with no MDS pass in front.
+            let mut want = vals;
+            for rc in &round_constants {
+                unsafe { external_round_asm(&mut want, rc); }
+            }
+
+            // The function under test runs every round in a single call.
+            let mut have = vals;
+            external_terminal_permute_state_asm(&mut have, &round_constants);
+            for (actual, expected) in have.iter().zip(&want) {
+                prop_assert_eq!(canon(*actual), canon(*expected));
+            }
+        }
+
+        #[test]
+        fn test_external_initial_permute_w8(
+            vals in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(44, 4);
+
+            // Expected: the generic initial external permute on a copy of the input.
+            let mut want = vals;
+            external_initial_permute_state_asm(&mut want, &round_constants);
+
+            // Actual: the width-8 specialization on another copy of the same input.
+            let mut have = vals;
+            external_initial_permute_w8(&mut have, &round_constants);
+
+            for (actual, expected) in have.iter().zip(&want) {
+                prop_assert_eq!(canon(*actual), canon(*expected));
+            }
+        }
+
+        #[test]
+        fn test_external_terminal_permute_w8(
+            vals in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(45, 4);
+            // Expected: the generic terminal external permute on a copy of the input.
+            let mut want = vals;
+            external_terminal_permute_state_asm(&mut want, &round_constants);
+            // Actual: the width-8 specialization on another copy of the same input.
+            let mut have = vals;
+            external_terminal_permute_w8(&mut have, &round_constants);
+
+            for (actual, expected) in have.iter().zip(&want) {
+                prop_assert_eq!(canon(*actual), canon(*expected));
+            }
+        }
+
+        // The dual-lane initial external permute must equal the single-lane one per lane.
+        #[test]
+        fn test_external_initial_permute_dual(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(46, 4);
+
+            // Expected: the single-lane permute on a copy of each lane separately.
+            let (mut want0, mut want1) = (vals0, vals1);
+            external_initial_permute_state_asm(&mut want0, &round_constants);
+            external_initial_permute_state_asm(&mut want1, &round_constants);
+
+            // Actual: one dual-lane call over fresh copies of the same inputs.
+            let (mut got0, mut got1) = (vals0, vals1);
+            external_initial_permute_dual(&mut got0, &mut got1, &round_constants);
+
+            // Compare element by element after passing both sides through canon().
+            for idx in 0..8 {
+                prop_assert_eq!(canon(got0[idx]), canon(want0[idx]));
+                prop_assert_eq!(canon(got1[idx]), canon(want1[idx]));
+            }
+        }
+
+        #[test]
+        fn test_external_terminal_permute_dual(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(47, 4);
+
+            // Expected: the single-lane terminal permute on a copy of each lane.
+            let (mut want0, mut want1) = (vals0, vals1);
+            external_terminal_permute_state_asm(&mut want0, &round_constants);
+            external_terminal_permute_state_asm(&mut want1, &round_constants);
+
+            // Actual: one dual-lane call over fresh copies of the same inputs.
+            let (mut got0, mut got1) = (vals0, vals1);
+            external_terminal_permute_dual(&mut got0, &mut got1, &round_constants);
+
+            for idx in 0..8 {
+                prop_assert_eq!(canon(got0[idx]), canon(want0[idx]));
+                prop_assert_eq!(canon(got1[idx]), canon(want1[idx]));
+            }
+        }
+
+        // The width-8 dual initial permute must equal the generic dual initial permute.
+        #[test]
+        fn test_external_initial_permute_dual_w8(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(48, 4);
+
+            // Expected: the generic dual-lane function on copies of both lanes.
+            let (mut want0, mut want1) = (vals0, vals1);
+            external_initial_permute_dual(&mut want0, &mut want1, &round_constants);
+
+            // Actual: the width-8 specialization on fresh copies of the same inputs.
+            let (mut got0, mut got1) = (vals0, vals1);
+            external_initial_permute_dual_w8(&mut got0, &mut got1, &round_constants);
+
+            // Compare element by element after passing both sides through canon().
+            for idx in 0..8 {
+                prop_assert_eq!(canon(got0[idx]), canon(want0[idx]));
+                prop_assert_eq!(canon(got1[idx]), canon(want1[idx]));
+            }
+        }
+
+        #[test]
+        fn test_external_terminal_permute_dual_w8(
+            vals0 in prop::array::uniform8(any::<u64>()),
+            vals1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(49, 4);
+
+            // Expected: the generic dual-lane terminal permute on copies of both lanes.
+            let (mut want0, mut want1) = (vals0, vals1);
+            external_terminal_permute_dual(&mut want0, &mut want1, &round_constants);
+
+            // Actual: the width-8 specialization on fresh copies of the same inputs.
+            let (mut got0, mut got1) = (vals0, vals1);
+            external_terminal_permute_dual_w8(&mut got0, &mut got1, &round_constants);
+
+            for idx in 0..8 {
+                prop_assert_eq!(canon(got0[idx]), canon(want0[idx]));
+                prop_assert_eq!(canon(got1[idx]), canon(want1[idx]));
+            }
+        }
+
+        #[test]
+        fn test_add_neon(a0: u64, a1: u64, b0: u64, b1: u64) {
+            unsafe {
+                // Vector path: the first element pairs a0 with b0, the second a1 with b1.
+                let lhs = make_neon(a0, a1);
+                let rhs = make_neon(b0, b1);
+                let (lane0, lane1) = read_neon(add_neon(lhs, rhs));
+                prop_assert_eq!(canon(lane0), canon(add_asm(a0, b0)));
+                prop_assert_eq!(canon(lane1), canon(add_asm(a1, b1)));
+            }
+        }
+
+        #[test]
+        fn test_sub_neon(a0: u64, a1: u64, b0: u64, b1: u64) {
+            unsafe {
+                // sub_neon on packed pairs must agree with sub_asm on each element.
+                let (lane0, lane1) = read_neon(sub_neon(make_neon(a0, a1), make_neon(b0, b1)));
+                prop_assert_eq!(canon(lane0), canon(sub_asm(a0, b0)));
+                prop_assert_eq!(canon(lane1), canon(sub_asm(a1, b1)));
+            }
+        }
+
+        #[test]
+        fn test_double_neon(a0: u64, a1: u64) {
+            unsafe {
+                // double_neon on a packed pair must agree with double_asm on each element.
+                let (lane0, lane1) = read_neon(double_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(double_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(double_asm(a1)));
+            }
+        }
+
+        #[test]
+        fn test_div2_neon(a0: u64, a1: u64) {
+            unsafe {
+                // div2_neon on a packed pair must agree with div2_asm on each element.
+                let (lane0, lane1) = read_neon(div2_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(div2_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(div2_asm(a1)));
+            }
+        }
+
+        #[test]
+        fn test_div4_neon(a0: u64, a1: u64) {
+            unsafe {
+                // div4_neon on a packed pair must agree with div4_asm on each element.
+                let (lane0, lane1) = read_neon(div4_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(div4_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(div4_asm(a1)));
+            }
+        }
+
+        #[test]
+        fn test_div8_neon(a0: u64, a1: u64) {
+            unsafe {
+                // div8_neon on a packed pair must agree with div8_asm on each element.
+                let (lane0, lane1) = read_neon(div8_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(div8_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(div8_asm(a1)));
+            }
+        }
+
+        #[test]
+        fn test_div16_neon(a0: u64, a1: u64) {
+            unsafe {
+                // div16_neon on a packed pair must agree with div16_asm on each element.
+                let (lane0, lane1) = read_neon(div16_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(div16_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(div16_asm(a1)));
+            }
+        }
+
+        #[test]
+        fn test_div32_neon(a0: u64, a1: u64) {
+            unsafe {
+                // div32_neon on a packed pair must agree with div32_asm on each element.
+                let (lane0, lane1) = read_neon(div32_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(div32_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(div32_asm(a1)));
+            }
+        }
+
+        #[test]
+        fn test_div_2_32_neon(a0: u64, a1: u64) {
+            unsafe {
+                // div_2_32_neon on a packed pair must agree with div_2_32_asm per element.
+                let (lane0, lane1) = read_neon(div_2_32_neon(make_neon(a0, a1)));
+                prop_assert_eq!(canon(lane0), canon(div_2_32_asm(a0)));
+                prop_assert_eq!(canon(lane1), canon(div_2_32_asm(a1)));
+            }
+        }
+
+        // apply_mat4_neon on packed lanes must equal apply_mat4_asm on each lane.
+        #[test]
+        fn test_apply_mat4_neon(
+            a0: u64, a1: u64, a2: u64, a3: u64,
+            b0: u64, b1: u64, b2: u64, b3: u64,
+        ) {
+            unsafe {
+                // Lane A and lane B are two independent 4-element inputs.
+                let (in_a, in_b) = ([a0, a1, a2, a3], [b0, b1, b2, b3]);
+
+                // Scalar reference: apply_mat4_asm on a copy of each lane.
+                let (mut want_a, mut want_b) = (in_a, in_b);
+                apply_mat4_asm(&mut want_a);
+                apply_mat4_asm(&mut want_b);
+
+                // NEON path: element i of the packed state is built from (in_a[i], in_b[i]).
+                let mut packed: [uint64x2_t; 4] =
+                    core::array::from_fn(|i| make_neon(in_a[i], in_b[i]));
+                apply_mat4_neon(&mut packed);
+
+                // Unpack each vector and compare both halves with the scalar results.
+                for (v, (want0, want1)) in packed.iter().zip(want_a.iter().zip(&want_b)) {
+                    let (got0, got1) = read_neon(*v);
+                    prop_assert_eq!(canon(got0), canon(*want0));
+                    prop_assert_eq!(canon(got1), canon(*want1));
+                }
+            }
+        }
+
+        // mds_light_neon on packed lanes must equal mds_light_permutation_asm per lane.
+        #[test]
+        fn test_mds_light_neon_w8(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            unsafe {
+                // Scalar reference: the ASM MDS on a copy of each lane.
+                let (mut want_a, mut want_b) = (lane_a, lane_b);
+                mds_light_permutation_asm(&mut want_a);
+                mds_light_permutation_asm(&mut want_b);
+
+                // NEON path: element idx is built from (lane_a[idx], lane_b[idx]).
+                let mut packed: [uint64x2_t; 8] =
+                    core::array::from_fn(|idx| make_neon(lane_a[idx], lane_b[idx]));
+                mds_light_neon(&mut packed);
+
+                // Unpack each vector and compare both halves with the scalar results.
+                for (v, (want0, want1)) in packed.iter().zip(want_a.iter().zip(&want_b)) {
+                    let (got0, got1) = read_neon(*v);
+                    prop_assert_eq!(canon(got0), canon(*want0));
+                    prop_assert_eq!(canon(got1), canon(*want1));
+                }
+            }
+        }
+
+        #[test]
+        fn test_sbox_neon(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            unsafe {
+                // Scalar reference: sbox_layer_asm on a copy of each lane.
+                let (mut want_a, mut want_b) = (lane_a, lane_b);
+                sbox_layer_asm(&mut want_a);
+                sbox_layer_asm(&mut want_b);
+
+                // NEON path: element idx is built from (lane_a[idx], lane_b[idx]).
+                let mut packed: [uint64x2_t; 8] =
+                    core::array::from_fn(|idx| make_neon(lane_a[idx], lane_b[idx]));
+                sbox_neon(&mut packed);
+
+                // Unpack each vector and compare both halves with the scalar results.
+                for (v, (want0, want1)) in packed.iter().zip(want_a.iter().zip(&want_b)) {
+                    let (got0, got1) = read_neon(*v);
+                    prop_assert_eq!(canon(got0), canon(*want0));
+                    prop_assert_eq!(canon(got1), canon(*want1));
+                }
+            }
+        }
+
+        #[test]
+        fn test_external_round_neon(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+            rc in prop::array::uniform8(any::<u64>()),
+        ) {
+            unsafe {
+                // Scalar reference: external_round_asm on a copy of each lane.
+                let (mut want_a, mut want_b) = (lane_a, lane_b);
+                external_round_asm(&mut want_a, &rc);
+                external_round_asm(&mut want_b, &rc);
+
+                // NEON path: element idx is built from (lane_a[idx], lane_b[idx]).
+                let mut packed: [uint64x2_t; 8] =
+                    core::array::from_fn(|idx| make_neon(lane_a[idx], lane_b[idx]));
+                external_round_neon(&mut packed, &rc);
+
+                // Unpack each vector and compare both halves with the scalar results.
+                for (v, (want0, want1)) in packed.iter().zip(want_a.iter().zip(&want_b)) {
+                    let (got0, got1) = read_neon(*v);
+                    prop_assert_eq!(canon(got0), canon(*want0));
+                    prop_assert_eq!(canon(got1), canon(*want1));
+                }
+            }
+        }
+
+        // Packing two lanes and unpacking them again must return the original arrays.
+        #[test]
+        fn test_lanes_roundtrip(
+            lane0 in prop::array::uniform8(any::<u64>()),
+            lane1 in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Pack both lanes into the NEON representation.
+            let packed = lanes_to_neon(&lane0, &lane1);
+
+            // Unpack into two zero-initialised buffers.
+            let (mut back0, mut back1) = ([0u64; 8], [0u64; 8]);
+            neon_to_lanes(&packed, &mut back0, &mut back1);
+
+            // Exact equality: no canon() here, the raw values must come back unchanged.
+            prop_assert_eq!(back0, lane0);
+            prop_assert_eq!(back1, lane1);
+        }
+
+        // external_initial_neon on packed lanes must equal the scalar permute per lane.
+        #[test]
+        fn test_external_initial_neon(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(50, 4);
+
+            // Scalar reference: the ASM initial permute on a copy of each lane.
+            let (mut want_a, mut want_b) = (lane_a, lane_b);
+            external_initial_permute_state_asm(&mut want_a, &round_constants);
+            external_initial_permute_state_asm(&mut want_b, &round_constants);
+
+            // NEON path: pack the untouched inputs and permute them together.
+            let mut packed = lanes_to_neon(&lane_a, &lane_b);
+            external_initial_neon(&mut packed, &round_constants);
+
+            // Unpack into two zero-initialised buffers before comparing.
+            let (mut got_a, mut got_b) = ([0u64; 8], [0u64; 8]);
+            neon_to_lanes(&packed, &mut got_a, &mut got_b);
+
+            for idx in 0..8 {
+                prop_assert_eq!(canon(got_a[idx]), canon(want_a[idx]));
+                prop_assert_eq!(canon(got_b[idx]), canon(want_b[idx]));
+            }
+        }
+
+        #[test]
+        fn test_external_terminal_neon(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            let round_constants = make_round_constants::<8>(51, 4);
+
+            // Scalar reference: the ASM terminal permute on a copy of each lane.
+            let (mut want_a, mut want_b) = (lane_a, lane_b);
+            external_terminal_permute_state_asm(&mut want_a, &round_constants);
+            external_terminal_permute_state_asm(&mut want_b, &round_constants);
+
+            // NEON path: pack the untouched inputs and permute them together.
+            let mut packed = lanes_to_neon(&lane_a, &lane_b);
+            external_terminal_neon(&mut packed, &round_constants);
+
+            // Unpack into two zero-initialised buffers before comparing.
+            let (mut got_a, mut got_b) = ([0u64; 8], [0u64; 8]);
+            neon_to_lanes(&packed, &mut got_a, &mut got_b);
+            for idx in 0..8 {
+                prop_assert_eq!(canon(got_a[idx]), canon(want_a[idx]));
+                prop_assert_eq!(canon(got_b[idx]), canon(want_b[idx]));
+            }
+        }
+    }
+
+    /// Checks a width-specialized NEON internal permute against a scalar one, per lane.
+    fn test_internal_neon_matches_scalar<const WIDTH: usize>(
+        diag: [F; WIDTH],
+        neon_fn: fn(&mut [uint64x2_t; WIDTH], &[u64]),
+        scalar_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]),
+    ) {
+        let mut rng = SmallRng::seed_from_u64(55);
+        let diag_raw: [u64; WIDTH] = core::array::from_fn(|lane| diag[lane].value);
+        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
+
+        // Two random input lanes, drawn after the constants.
+        let lane_a: [u64; WIDTH] = rng.random();
+        let lane_b: [u64; WIDTH] = rng.random();
+
+        // Expected: the scalar function on a copy of each lane, with the diagonal passed in.
+        let (mut want_a, mut want_b) = (lane_a, lane_b);
+        scalar_fn(&mut want_a, &diag_raw, &constants);
+        scalar_fn(&mut want_b, &diag_raw, &constants);
+
+        // Actual: pack both lanes and run the NEON function, which is not handed the diagonal.
+        let mut packed = lanes_to_neon(&lane_a, &lane_b);
+        neon_fn(&mut packed, &constants);
+
+        // Unpack into two zero-initialised buffers before comparing.
+        let (mut got_a, mut got_b) = ([0u64; WIDTH], [0u64; WIDTH]);
+        neon_to_lanes(&packed, &mut got_a, &mut got_b);
+
+        for i in 0..WIDTH {
+            assert_eq!(canon(got_a[i]), canon(want_a[i]), "lane0 mismatch at {i}");
+            assert_eq!(canon(got_b[i]), canon(want_b[i]), "lane1 mismatch at {i}");
+        }
+    }
+
+    #[test] // internal_permute_neon_w12 vs. per-lane internal_permute_state_asm.
+    fn test_internal_permute_neon_w12() {
+        test_internal_neon_matches_scalar(
+            MATRIX_DIAG_12_GOLDILOCKS,
+            internal_permute_neon_w12,
+            internal_permute_state_asm,
+        );
+    }
+
+    #[test] // internal_permute_neon_w16 vs. per-lane internal_permute_state_asm.
+    fn test_internal_permute_neon_w16() {
+        test_internal_neon_matches_scalar(
+            MATRIX_DIAG_16_GOLDILOCKS,
+            internal_permute_neon_w16,
+            internal_permute_state_asm,
+        );
+    }
+
+    /// Checks the generic `internal_permute_neon` against the scalar ASM permute, per lane.
+    fn test_internal_neon_generic_matches_scalar<const WIDTH: usize>(diag: [F; WIDTH]) {
+        let mut rng = SmallRng::seed_from_u64(66);
+        let diag_raw: [u64; WIDTH] = core::array::from_fn(|lane| diag[lane].value);
+        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
+
+        // Two random input lanes, drawn after the constants.
+        let lane_a: [u64; WIDTH] = rng.random();
+        let lane_b: [u64; WIDTH] = rng.random();
+
+        // Expected: the scalar ASM permute on a copy of each lane.
+        let (mut want_a, mut want_b) = (lane_a, lane_b);
+        internal_permute_state_asm(&mut want_a, &diag_raw, &constants);
+        internal_permute_state_asm(&mut want_b, &diag_raw, &constants);
+
+        // Actual: pack both lanes and run the generic NEON permute with the same diagonal.
+        let mut packed = lanes_to_neon(&lane_a, &lane_b);
+        internal_permute_neon(&mut packed, &diag_raw, &constants);
+
+        // Unpack into two zero-initialised buffers before comparing.
+        let (mut got_a, mut got_b) = ([0u64; WIDTH], [0u64; WIDTH]);
+        neon_to_lanes(&packed, &mut got_a, &mut got_b);
+
+        for i in 0..WIDTH {
+            assert_eq!(canon(got_a[i]), canon(want_a[i]), "lane0 mismatch at {i}");
+            assert_eq!(canon(got_b[i]), canon(want_b[i]), "lane1 mismatch at {i}");
+        }
+    }
+
+    #[test] // Generic internal_permute_neon vs. per-lane scalar permute (width-8 diagonal).
+    fn test_internal_permute_neon_generic_w8() {
+        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_8_GOLDILOCKS);
+    }
+
+    #[test] // Generic internal_permute_neon vs. per-lane scalar permute (width-12 diagonal).
+    fn test_internal_permute_neon_generic_w12() {
+        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_12_GOLDILOCKS);
+    }
+
+    #[test] // Generic internal_permute_neon vs. per-lane scalar permute (width-16 diagonal).
+    fn test_internal_permute_neon_generic_w16() {
+        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_16_GOLDILOCKS);
+    }
+
+    #[test] // Generic internal_permute_neon vs. per-lane scalar permute (width-20 diagonal).
+    fn test_internal_permute_neon_generic_w20() {
+        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_20_GOLDILOCKS);
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs
new file mode 100644
index 000000000..3d1951a57
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs
@@ -0,0 +1,400 @@
+//! Shared utilities for Goldilocks NEON assembly.
+
+use core::arch::asm;
+
+use super::packing::PackedGoldilocksNeon;
+use crate::{Goldilocks, P};
+
+const EPSILON: u64 = P.wrapping_neg(); // 2^32 - 1
+
+// ---------------------------------------------------------------------------
+// Scalar field arithmetic (inline assembly)
+// ---------------------------------------------------------------------------
+
+/// Multiply two Goldilocks elements using inline assembly.
+///
+/// Computes `a * b mod P` where P = 2^64 - 2^32 + 1, folding the 128-bit
+/// product with the identity `2^64 = 2^32 - 1 (mod P)` (i.e. EPSILON).
+/// No final canonicalization is done, so the result may be non-canonical.
+#[inline(always)]
+pub(super) unsafe fn mul_asm(a: u64, b: u64) -> u64 {
+    let _lo: u64;
+    let _hi: u64;
+    let _t0: u64;
+    let _t1: u64;
+    let _t2: u64;
+    let result: u64;
+
+    unsafe {
+        asm!(
+            // Compute 128-bit product: hi:lo = a * b
+            "mul   {lo}, {a}, {b}",
+            "umulh {hi}, {a}, {b}",
+
+            // Reduce: result = lo - hi_hi + hi_lo * EPSILON
+            // where hi = hi_hi * 2^32 + hi_lo
+
+            // t1 = lo - (hi >> 32), with borrow detection
+            "lsr   {t0}, {hi}, #32",          // t0 = hi >> 32
+            "subs  {t1}, {lo}, {t0}",         // t1 = lo - t0, set flags
+            "csetm {t2:w}, cc",               // t2 = 0xFFFF_FFFF (EPSILON) if borrow, else 0
+            "sub   {t1}, {t1}, {t2}",         // Adjust for borrow (subtract EPSILON)
+
+            // t0 = (hi & EPSILON) * EPSILON
+            "and   {t0}, {hi}, {epsilon}",    // t0 = hi & EPSILON
+            "mul   {t0}, {t0}, {epsilon}",    // t0 = t0 * EPSILON
+
+            // result = t1 + t0, with overflow detection
+            "adds  {result}, {t1}, {t0}",     // result = t1 + t0, set flags
+            "csetm {t2:w}, cs",               // t2 = 0xFFFF_FFFF (EPSILON) if carry, else 0
+            "add   {result}, {result}, {t2}", // Add EPSILON on overflow
+
+            a = in(reg) a,
+            b = in(reg) b,
+            epsilon = in(reg) EPSILON,
+            lo = out(reg) _lo,
+            hi = out(reg) _hi,
+            t0 = out(reg) _t0,
+            t1 = out(reg) _t1,
+            t2 = out(reg) _t2,
+            result = out(reg) result,
+            options(pure, nomem, nostack),
+        );
+    }
+
+    result
+}
+
+/// Compute `a * b + c` in the Goldilocks field using inline assembly.
+///
+/// Fused multiply-add: forms the 128-bit product `a * b`, adds `c` into
+/// the low limb (carry goes into the high limb), then reduces modulo P.
+#[inline(always)]
+pub(super) unsafe fn mul_add_asm(a: u64, b: u64, c: u64) -> u64 {
+    let _lo: u64;
+    let _hi: u64;
+    let _t0: u64;
+    let _t1: u64;
+    let _t2: u64;
+    let result: u64;
+
+    unsafe {
+        asm!(
+            // Compute 128-bit product: hi:lo = a * b
+            "mul   {lo}, {a}, {b}",
+            "umulh {hi}, {a}, {b}",
+
+            // Accumulate c into the 128-bit product: hi:lo = hi:lo + c
+            "adds  {lo}, {lo}, {c}",
+            "adc   {hi}, {hi}, xzr",
+
+            // Reduce: result = lo - hi_hi + hi_lo * EPSILON
+            // where hi = hi_hi * 2^32 + hi_lo
+
+            // t1 = lo - (hi >> 32), with borrow detection
+            "lsr   {t0}, {hi}, #32",          // t0 = hi >> 32
+            "subs  {t1}, {lo}, {t0}",         // t1 = lo - t0, set flags
+            "csetm {t2:w}, cc",               // t2 = 0xFFFF_FFFF (EPSILON) if borrow, else 0
+            "sub   {t1}, {t1}, {t2}",         // Adjust for borrow (subtract EPSILON)
+
+            // t0 = (hi & EPSILON) * EPSILON
+            "and   {t0}, {hi}, {epsilon}",    // t0 = hi & EPSILON
+            "mul   {t0}, {t0}, {epsilon}",    // t0 = t0 * EPSILON
+
+            // result = t1 + t0, with overflow detection
+            "adds  {result}, {t1}, {t0}",     // result = t1 + t0, set flags
+            "csetm {t2:w}, cs",               // t2 = 0xFFFF_FFFF (EPSILON) if carry, else 0
+            "add   {result}, {result}, {t2}", // Add EPSILON on overflow
+
+            a = in(reg) a,
+            b = in(reg) b,
+            c = in(reg) c,
+            epsilon = in(reg) EPSILON,
+            lo = out(reg) _lo,
+            hi = out(reg) _hi,
+            t0 = out(reg) _t0,
+            t1 = out(reg) _t1,
+            t2 = out(reg) _t2,
+            result = out(reg) result,
+            options(pure, nomem, nostack),
+        );
+    }
+
+    result
+}
+
+/// Add two Goldilocks elements with overflow handling using inline assembly.
+///
+/// Computes `a + b mod P`: on carry out of 64 bits, EPSILON (2^32 - 1 = 2^64 - P)
+/// is added once. NOTE(review): a carry out of that correction is not folded.
+#[inline(always)]
+pub(super) unsafe fn add_asm(a: u64, b: u64) -> u64 {
+    let result: u64;
+    let _adj: u64;
+
+    unsafe {
+        asm!(
+            "adds  {result}, {a}, {b}",
+            "csetm {adj:w}, cs",
+            "add   {result}, {result}, {adj}",
+            a = in(reg) a,
+            b = in(reg) b,
+            result = out(reg) result,
+            adj = out(reg) _adj,
+            options(pure, nomem, nostack),
+        );
+    }
+
+    result
+}
+
+// ---------------------------------------------------------------------------
+// Lane conversion (packed NEON <-> raw u64 arrays)
+// ---------------------------------------------------------------------------
+
+/// Unpack a packed NEON state into two raw `u64` lane arrays.
+///
+/// Each packed slot contains two Goldilocks elements (lane 0, lane 1).
+/// This function extracts the internal `u64` representation of each
+/// element into two separate arrays, one per lane.
+///
+/// # Layout
+///
+/// ```text
+///     packed[i] = (field_elem_a, field_elem_b)
+///
+///     lane0[i] = field_elem_a.value    (raw u64)
+///     lane1[i] = field_elem_b.value    (raw u64)
+/// ```
+#[inline]
+pub(super) fn unpack_lanes<const WIDTH: usize>(
+    state: &[PackedGoldilocksNeon; WIDTH],
+) -> ([u64; WIDTH], [u64; WIDTH]) {
+    // Read each element's internal `value` as-is; it may be non-canonical.
+    let lane0: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[0].value);
+    let lane1: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[1].value);
+    (lane0, lane1)
+}
+
+/// Pack two raw `u64` lane arrays back into a packed NEON state.
+///
+/// Each raw value is wrapped into a Goldilocks field element as-is
+/// (`Goldilocks::new` performs no reduction) and paired into a packed slot.
+///
+/// # Layout
+///
+/// ```text
+///     lane0[i], lane1[i]  ->  packed[i] = (Goldilocks(lane0[i]), Goldilocks(lane1[i]))
+/// ```
+#[inline]
+pub(super) fn pack_lanes<const WIDTH: usize>(
+    state: &mut [PackedGoldilocksNeon; WIDTH],
+    lane0: &[u64; WIDTH],
+    lane1: &[u64; WIDTH],
+) {
+    for i in 0..WIDTH {
+        // Wrap each raw u64 (unreduced) into a field element and pair them.
+        state[i] = PackedGoldilocksNeon([Goldilocks::new(lane0[i]), Goldilocks::new(lane1[i])]);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::{PrimeCharacteristicRing, PrimeField64};
+    use proptest::prelude::*;
+
+    use super::*;
+
+    type F = Goldilocks;
+
+    /// Reduce a raw `u64` to its canonical Goldilocks representative.
+    fn canon(x: u64) -> u64 {
+        F::new(x).as_canonical_u64()
+    }
+
+    proptest! {
+        // ----------------------------------------------------------------
+        // Scalar field arithmetic
+        // ----------------------------------------------------------------
+
+        /// Verify ASM addition against field addition.
+        #[test]
+        fn test_add_asm(a: u64, b: u64) {
+            let expected = (F::new(a) + F::new(b)).as_canonical_u64();
+            let got = canon(unsafe { add_asm(a, b) });
+            prop_assert_eq!(got, expected);
+        }
+
+        /// Verify ASM multiplication against field multiplication.
+        #[test]
+        fn test_mul_asm(a: u64, b: u64) {
+            let expected = (F::new(a) * F::new(b)).as_canonical_u64();
+            let got = canon(unsafe { mul_asm(a, b) });
+            prop_assert_eq!(got, expected);
+        }
+
+        /// Verify ASM fused multiply-add against field multiply-add.
+        #[test]
+        fn test_mul_add_asm(a: u64, b: u64, c: u64) {
+            let expected = (F::new(a) * F::new(b) + F::new(c)).as_canonical_u64();
+            let got = canon(unsafe { mul_add_asm(a, b, c) });
+            prop_assert_eq!(got, expected);
+        }
+
+        // ----------------------------------------------------------------
+        // Unpack: packed state -> two raw u64 lane arrays
+        // ----------------------------------------------------------------
+
+        #[test]
+        fn test_unpack_lanes_w8(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Build a packed state from two independent lane arrays.
+            let packed: [PackedGoldilocksNeon; 8] =
+                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
+
+            // Unpack into raw u64 lane arrays.
+            let (got0, got1) = unpack_lanes(&packed);
+
+            // Each raw value must be the internal representation of the field element.
+            for i in 0..8 {
+                prop_assert_eq!(got0[i], F::new(lane_a[i]).value);
+                prop_assert_eq!(got1[i], F::new(lane_b[i]).value);
+            }
+        }
+
+        #[test]
+        fn test_unpack_lanes_w12(
+            lane_a in prop::array::uniform12(any::<u64>()),
+            lane_b in prop::array::uniform12(any::<u64>()),
+        ) {
+            // Same verification, width 12.
+            let packed: [PackedGoldilocksNeon; 12] =
+                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
+
+            let (got0, got1) = unpack_lanes(&packed);
+
+            for i in 0..12 {
+                prop_assert_eq!(got0[i], F::new(lane_a[i]).value);
+                prop_assert_eq!(got1[i], F::new(lane_b[i]).value);
+            }
+        }
+
+        // ----------------------------------------------------------------
+        // Pack: two raw u64 lane arrays -> packed state
+        // ----------------------------------------------------------------
+
+        #[test]
+        fn test_pack_lanes_w8(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Pack two raw lane arrays into packed state.
+            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8];
+            pack_lanes(&mut packed, &lane_a, &lane_b);
+
+            // Each packed element must hold the two corresponding field elements.
+            for i in 0..8 {
+                prop_assert_eq!(packed[i].0[0], F::new(lane_a[i]));
+                prop_assert_eq!(packed[i].0[1], F::new(lane_b[i]));
+            }
+        }
+
+        #[test]
+        fn test_pack_lanes_w12(
+            lane_a in prop::array::uniform12(any::<u64>()),
+            lane_b in prop::array::uniform12(any::<u64>()),
+        ) {
+            // Same verification, width 12.
+            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12];
+            pack_lanes(&mut packed, &lane_a, &lane_b);
+
+            for i in 0..12 {
+                prop_assert_eq!(packed[i].0[0], F::new(lane_a[i]));
+                prop_assert_eq!(packed[i].0[1], F::new(lane_b[i]));
+            }
+        }
+
+        // ----------------------------------------------------------------
+        // Roundtrip: pack then unpack recovers canonical values
+        // ----------------------------------------------------------------
+
+        #[test]
+        fn test_roundtrip_pack_unpack_w8(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Pack two lane arrays, then unpack them.
+            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8];
+            pack_lanes(&mut packed, &lane_a, &lane_b);
+            let (out0, out1) = unpack_lanes(&packed);
+
+            // The canonical form of the recovered values must match.
+            for i in 0..8 {
+                prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64());
+                prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64());
+            }
+        }
+
+        #[test]
+        fn test_roundtrip_pack_unpack_w12(
+            lane_a in prop::array::uniform12(any::<u64>()),
+            lane_b in prop::array::uniform12(any::<u64>()),
+        ) {
+            // Same roundtrip, width 12.
+            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12];
+            pack_lanes(&mut packed, &lane_a, &lane_b);
+            let (out0, out1) = unpack_lanes(&packed);
+
+            for i in 0..12 {
+                prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64());
+                prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64());
+            }
+        }
+
+        // ----------------------------------------------------------------
+        // Roundtrip: unpack then pack preserves packed state
+        // ----------------------------------------------------------------
+
+        #[test]
+        fn test_roundtrip_unpack_pack_w8(
+            lane_a in prop::array::uniform8(any::<u64>()),
+            lane_b in prop::array::uniform8(any::<u64>()),
+        ) {
+            // Start from a packed state.
+            let original: [PackedGoldilocksNeon; 8] =
+                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
+
+            // Unpack into raw lanes, then pack back.
+            let (raw0, raw1) = unpack_lanes(&original);
+            let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 8];
+            pack_lanes(&mut restored, &raw0, &raw1);
+
+            // The restored packed state must equal the original.
+            for i in 0..8 {
+                prop_assert_eq!(restored[i].0[0], original[i].0[0]);
+                prop_assert_eq!(restored[i].0[1], original[i].0[1]);
+            }
+        }
+
+        #[test]
+        fn test_roundtrip_unpack_pack_w12(
+            lane_a in prop::array::uniform12(any::<u64>()),
+            lane_b in prop::array::uniform12(any::<u64>()),
+        ) {
+            // Same reverse roundtrip, width 12.
+            let original: [PackedGoldilocksNeon; 12] =
+                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
+
+            let (raw0, raw1) = unpack_lanes(&original);
+            let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 12];
+            pack_lanes(&mut restored, &raw0, &raw1);
+
+            for i in 0..12 {
+                prop_assert_eq!(restored[i].0[0], original[i].0[0]);
+                prop_assert_eq!(restored[i].0[1], original[i].0[1]);
+            }
+        }
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs
new file mode 100644
index 000000000..5ac38a28b
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs
@@ -0,0 +1,217 @@
+use p3_field::extension::{
+    BinomiallyExtendable, BinomiallyExtendableAlgebra, HasTwoAdicBinomialExtension,
+};
+use p3_field::{PrimeCharacteristicRing, TwoAdicField, field_to_array};
+
+use crate::Goldilocks;
+
+impl BinomiallyExtendableAlgebra<Self, 2> for Goldilocks {}
+
+impl BinomiallyExtendable<2> for Goldilocks {
+    // Verifiable in Sage with
+    // `R.<x> = GF(p)[]; assert (x^2 - 7).is_irreducible()`.
+    const W: Self = Self::new(7);
+
+    // DTH_ROOT = W^((p - 1)/2).
+    const DTH_ROOT: Self = Self::new(18446744069414584320);
+
+    const EXT_GENERATOR: [Self; 2] = [
+        Self::new(18081566051660590251),
+        Self::new(16121475356294670766),
+    ];
+}
+
+impl HasTwoAdicBinomialExtension<2> for Goldilocks {
+    const EXT_TWO_ADICITY: usize = 33;
+
+    fn ext_two_adic_generator(bits: usize) -> [Self; 2] {
+        assert!(bits <= 33);
+
+        if bits == 33 {
+            [Self::ZERO, Self::new(15659105665374529263)]
+        } else {
+            [Self::two_adic_generator(bits), Self::ZERO]
+        }
+    }
+}
+
+impl BinomiallyExtendableAlgebra<Self, 3> for Goldilocks {}
+
+impl BinomiallyExtendable<3> for Goldilocks {
+    // Verifiable in Sage with
+    // `R.<x> = GF(p)[]; assert (x^3 - 2).is_irreducible()`.
+    // Same irreducible as Lambda's Degree3GoldilocksExtensionField.
+    const W: Self = Self::new(2);
+
+    // DTH_ROOT = W^((p - 1)/3) = 2^64 mod p = 2^32 - 1 (2 has order 192, (p-1)/3 = 64 mod 192).
+    const DTH_ROOT: Self = Self::new(4294967295);
+
+    // Generator of GF(p^3)* = 5 + w. Verified: passes order checks for
+    // all small prime factors of p^3 - 1.
+    const EXT_GENERATOR: [Self; 3] = [Self::new(5), Self::ONE, Self::ZERO];
+}
+
+impl HasTwoAdicBinomialExtension<3> for Goldilocks {
+    // v_2(p^3 - 1) = v_2(p-1) + v_2(p^2+p+1) = 32 + 0 = 32.
+    const EXT_TWO_ADICITY: usize = 32;
+
+    fn ext_two_adic_generator(bits: usize) -> [Self; 3] {
+        assert!(bits <= 32);
+        field_to_array(Self::two_adic_generator(bits))
+    }
+}
+
+impl BinomiallyExtendableAlgebra<Self, 5> for Goldilocks {}
+
+impl BinomiallyExtendable<5> for Goldilocks {
+    // Verifiable via:
+    //  ```sage
+    //  # Define Fp
+    //  p = 2**64 - 2**32 + 1
+    //  F = GF(p)
+
+    //  # Define Fp[z]
+    //  R.<z> = PolynomialRing(F)
+
+    //  # The polynomial x^5-3 is irreducible
+    //  assert(R(z^5-3).is_irreducible())
+    //  ```
+    const W: Self = Self::new(3);
+
+    // 5-th root = w^((p - 1)/5)
+    const DTH_ROOT: Self = Self::new(1041288259238279555);
+
+    // Generator of the extension field
+    // Obtained by finding the smallest Hamming weight vector
+    // with appropriate order, starting at [0,1,0,0,0]
+    const EXT_GENERATOR: [Self; 5] = [Self::TWO, Self::ONE, Self::ZERO, Self::ZERO, Self::ZERO];
+}
+
+impl HasTwoAdicBinomialExtension<5> for Goldilocks {
+    const EXT_TWO_ADICITY: usize = 32;
+
+    fn ext_two_adic_generator(bits: usize) -> [Self; 5] {
+        assert!(bits <= 32);
+
+        field_to_array(Self::two_adic_generator(bits))
+    }
+}
+
+#[cfg(test)]
+mod test_quadratic_extension {
+
+    use num_bigint::BigUint;
+    use p3_field::extension::BinomialExtensionField;
+    use p3_field::{ExtensionField, PrimeCharacteristicRing};
+    use p3_field_testing::{
+        test_extension_field, test_field, test_packed_extension_field,
+        test_two_adic_extension_field,
+    };
+
+    use crate::Goldilocks;
+
+    type F = Goldilocks;
+    type EF = BinomialExtensionField<F, 2>;
+
+    // There is a redundant representation of zero but we already tested it
+    // when testing the base field.
+    const ZEROS: [EF; 1] = [EF::ZERO];
+    const ONES: [EF; 1] = [EF::ONE];
+
+    // Get the prime factorization of the order of the multiplicative group.
+    // i.e. the prime factorization of P^2 - 1.
+    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 9] {
+        [
+            (BigUint::from(2u8), 33),
+            (BigUint::from(3u8), 1),
+            (BigUint::from(5u8), 1),
+            (BigUint::from(7u8), 1),
+            (BigUint::from(17u8), 1),
+            (BigUint::from(179u8), 1),
+            (BigUint::from(257u16), 1),
+            (BigUint::from(65537u32), 1),
+            (BigUint::from(7361031152998637u64), 1),
+        ]
+    }
+
+    test_field!(
+        super::EF,
+        &super::ZEROS,
+        &super::ONES,
+        &super::multiplicative_group_prime_factorization()
+    );
+
+    test_extension_field!(super::F, super::EF);
+    test_two_adic_extension_field!(super::F, super::EF);
+
+    type Pef = <EF as ExtensionField<F>>::ExtensionPacking;
+    const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO];
+    const PACKED_ONES: [Pef; 1] = [Pef::ONE];
+    test_packed_extension_field!(
+        super::F,
+        super::EF,
+        super::Pef,
+        &super::PACKED_ZEROS,
+        &super::PACKED_ONES
+    );
+}
+
+#[cfg(test)]
+mod test_quintic_extension {
+
+    use num_bigint::BigUint;
+    use p3_field::extension::BinomialExtensionField;
+    use p3_field::{ExtensionField, PrimeCharacteristicRing};
+    use p3_field_testing::{
+        test_extension_field, test_field, test_packed_extension_field,
+        test_two_adic_extension_field,
+    };
+
+    use crate::Goldilocks;
+
+    type F = Goldilocks;
+    type EF = BinomialExtensionField<F, 5>;
+
+    // There is a redundant representation of zero but we already tested it
+    // when testing the base field.
+    const ZEROS: [EF; 1] = [EF::ZERO];
+    const ONES: [EF; 1] = [EF::ONE];
+
+    // Get the prime factorization of the order of the multiplicative group.
+    // i.e. the prime factorization of P^5 - 1.
+    fn multiplicative_group_prime_factorization() -> [(num_bigint::BigUint, u32); 10] {
+        [
+            (BigUint::from(2u8), 32),
+            (BigUint::from(3u8), 1),
+            (BigUint::from(5u8), 2),
+            (BigUint::from(17u8), 1),
+            (BigUint::from(257u16), 1),
+            (BigUint::from(45971u16), 1),
+            (BigUint::from(65537u32), 1),
+            (BigUint::from(255006435240067831u64), 1),
+            (BigUint::from(280083648770327405561u128), 1),
+            (BigUint::from(7053197395277272939628824863222181u128), 1),
+        ]
+    }
+
+    test_field!(
+        super::EF,
+        &super::ZEROS,
+        &super::ONES,
+        &super::multiplicative_group_prime_factorization()
+    );
+
+    test_extension_field!(super::F, super::EF);
+    test_two_adic_extension_field!(super::F, super::EF);
+
+    type Pef = <EF as ExtensionField<F>>::ExtensionPacking;
+    const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO];
+    const PACKED_ONES: [Pef; 1] = [Pef::ONE];
+    test_packed_extension_field!(
+        super::F,
+        super::EF,
+        super::Pef,
+        &super::PACKED_ZEROS,
+        &super::PACKED_ONES
+    );
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs
new file mode 100644
index 000000000..ebe3f8c7a
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs
@@ -0,0 +1,813 @@
+use alloc::vec;
+use alloc::vec::Vec;
+use core::fmt::{Debug, Display, Formatter};
+use core::hash::{Hash, Hasher};
+use core::iter::{Product, Sum};
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+use core::{array, fmt};
+
+use num_bigint::BigUint;
+use p3_challenger::UniformSamplingField;
+use p3_field::exponentiation::exp_10540996611094048183;
+use p3_field::integers::QuotientMap;
+use p3_field::op_assign_macros::{
+    impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign,
+};
+use p3_field::{
+    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField,
+    PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64,
+    quotient_map_large_iint, quotient_map_large_uint, quotient_map_small_int,
+};
+use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner};
+use rand::Rng;
+use rand::distr::{Distribution, StandardUniform};
+use serde::{Deserialize, Serialize};
+
+/// The Goldilocks prime
+pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001;
+
+/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
+///
+/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64.
+#[derive(Copy, Clone, Default, Serialize, Deserialize)]
+#[repr(transparent)] // Important for reasoning about memory layout
+#[must_use]
+pub struct Goldilocks {
+    /// Not necessarily canonical.
+    pub(crate) value: u64,
+}
+
+impl Goldilocks {
+    /// Create a new field element from any `u64`.
+    ///
+    /// Any `u64` value is accepted. No reduction is performed since
+    /// Goldilocks uses a non-canonical internal representation.
+    #[inline]
+    pub const fn new(value: u64) -> Self {
+        Self { value }
+    }
+
+    /// Convert a `[u64; N]` array to an array of field elements.
+    ///
+    /// Const version of `input.map(Goldilocks::new)`.
+    #[inline]
+    pub const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
+        let mut output = [Self::ZERO; N];
+        let mut i = 0;
+        while i < N {
+            output[i].value = input[i];
+            i += 1;
+        }
+        output
+    }
+
+    /// Convert a `[[u64; N]; M]` array to a 2D array of field elements.
+    ///
+    /// Const version of `input.map(Goldilocks::new_array)`.
+    #[inline]
+    pub const fn new_2d_array<const N: usize, const M: usize>(
+        input: [[u64; N]; M],
+    ) -> [[Self; N]; M] {
+        let mut output = [[Self::ZERO; N]; M];
+        let mut i = 0;
+        while i < M {
+            output[i] = Self::new_array(input[i]);
+            i += 1;
+        }
+        output
+    }
+
+    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
+    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
+
+    /// A list of generators for the two-adic subgroups of the goldilocks field.
+    ///
+    /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
+    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
+        0x0000000000000001,
+        0xffffffff00000000,
+        0x0001000000000000,
+        0xfffffffeff000001,
+        0xefffffff00000001,
+        0x00003fffffffc000,
+        0x0000008000000000,
+        0xf80007ff08000001,
+        0xbf79143ce60ca966,
+        0x1905d02a5c411f4e,
+        0x9d8f2ad78bfed972,
+        0x0653b4801da1c8cf,
+        0xf2c35199959dfcb6,
+        0x1544ef2335d17997,
+        0xe0ee099310bba1e2,
+        0xf6b2cffe2306baac,
+        0x54df9630bf79450e,
+        0xabd0a6e8aa3d8a0e,
+        0x81281a7b05f9beac,
+        0xfbd41c6b8caa3302,
+        0x30ba2ecd5e93e76d,
+        0xf502aef532322654,
+        0x4b2a18ade67246b5,
+        0xea9d5a1336fbc98b,
+        0x86cdcc31c307e171,
+        0x4bbaf5976ecfefd8,
+        0xed41d05b78d6e286,
+        0x10d78dd8915a171d,
+        0x59049500004a4485,
+        0xdfa8c93ba46d2666,
+        0x7e9bd009b86a0845,
+        0x400a7f755588e659,
+        0x185629dcda58878c,
+    ]);
+
+    /// A list of powers of two from 0 to 95.
+    ///
+    /// Note that 2^{96} = -1 mod P so all powers of two can be simply
+    /// derived from this list.
+    const POWERS_OF_TWO: [Self; 96] = {
+        let mut powers_of_two = [Self::ONE; 96];
+
+        let mut i = 1;
+        while i < 64 {
+            powers_of_two[i] = Self::new(1 << i);
+            i += 1;
+        }
+        let mut var = Self::new(1 << 63);
+        while i < 96 {
+            var = const_add(var, var);
+            powers_of_two[i] = var;
+            i += 1;
+        }
+        powers_of_two
+    };
+}
+
+impl PartialEq for Goldilocks {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_canonical_u64() == other.as_canonical_u64()
+    }
+}
+
+impl Eq for Goldilocks {}
+
+impl Packable for Goldilocks {}
+
+impl Hash for Goldilocks {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        state.write_u64(self.as_canonical_u64());
+    }
+}
+
+impl Ord for Goldilocks {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.as_canonical_u64().cmp(&other.as_canonical_u64())
+    }
+}
+
+impl PartialOrd for Goldilocks {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Display for Goldilocks {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        Display::fmt(&self.as_canonical_u64(), f)
+    }
+}
+
+impl Debug for Goldilocks {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        Debug::fmt(&self.as_canonical_u64(), f)
+    }
+}
+
+impl Distribution<Goldilocks> for StandardUniform {
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
+        loop {
+            let next_u64 = rng.next_u64();
+            let is_canonical = next_u64 < Goldilocks::ORDER_U64;
+            if is_canonical {
+                return Goldilocks::new(next_u64);
+            }
+        }
+    }
+}
+
+impl UniformSamplingField for Goldilocks {
+    const MAX_SINGLE_SAMPLE_BITS: usize = 24;
+    const SAMPLING_BITS_M: [u64; 64] = {
+        let prime: u64 = P;
+        let mut a = [0u64; 64];
+        let mut k = 0;
+        while k < 64 {
+            if k == 0 {
+                a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always.
+            } else {
+                // Create a mask to zero out the last k bits
+                let mask = !((1u64 << k) - 1);
+                a[k] = prime & mask;
+            }
+            k += 1;
+        }
+        a
+    };
+}
+
+impl PrimeCharacteristicRing for Goldilocks {
+    type PrimeSubfield = Self;
+
+    const ZERO: Self = Self::new(0);
+    const ONE: Self = Self::new(1);
+    const TWO: Self = Self::new(2);
+    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
+
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        f
+    }
+
+    #[inline]
+    fn from_bool(b: bool) -> Self {
+        Self::new(b.into())
+    }
+
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::new(halve_u64::<P>(self.value))
+    }
+
+    #[inline]
+    fn mul_2exp_u64(&self, exp: u64) -> Self {
+        // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P.
+        if exp < 96 {
+            *self * Self::POWERS_OF_TWO[exp as usize]
+        } else if exp < 192 {
+            -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
+        } else {
+            self.mul_2exp_u64(exp % 192)
+        }
+    }
+
+    #[inline]
+    fn div_2exp_u64(&self, mut exp: u64) -> Self {
+        // In the goldilocks field, 2^192 = 1 mod P.
+        // Thus 2^{-n} = 2^{192 - n} mod P.
+        exp %= 192;
+        self.mul_2exp_u64(192 - exp)
+    }
+
+    #[inline]
+    fn sum_array<const N: usize>(input: &[Self]) -> Self {
+        assert_eq!(N, input.len());
+        // Benchmarking shows that for N <= 3 it's faster to sum the elements directly
+        // but for N > 3 it's faster to use the .sum() methods which passes through u128's
+        // allowing for delayed reductions.
+        match N {
+            0 => Self::ZERO,
+            1 => input[0],
+            2 => input[0] + input[1],
+            3 => input[0] + input[1] + input[2],
+            _ => input.iter().copied().sum(),
+        }
+    }
+
+    #[inline]
+    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
+        // The constant OFFSET has 2 important properties:
+        // 1. It is a multiple of P.
+        // 2. It is greater than the maximum possible value of the product of two u64s.
+        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
+        assert!(N <= (1 << 31));
+        match N {
+            0 => Self::ZERO,
+            1 => lhs[0] * rhs[0],
+            2 => {
+                // We unroll the N = 2 case as it is slightly faster and this is an important case
+                // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension.
+                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
+                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
+
+                // We know that long_prod_0, long_prod_1 < OFFSET.
+                // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET.
+                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
+                // Compiler really likes defining sum_corr here instead of in the if/else.
+                let sum_corr = sum.wrapping_sub(OFFSET);
+                if over {
+                    reduce128(sum_corr)
+                } else {
+                    reduce128(sum)
+                }
+            }
+            _ => {
+                let (lo_plus_hi, hi) = lhs
+                    .iter()
+                    .zip(rhs)
+                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
+                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
+                        // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits.
+                        let val_hi = (val >> 96) as u64;
+                        // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi.
+                        // As N <= 2^31 (asserted above) and val_hi < 2^32, acc_hi < 2^63 cannot overflow.
+                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
+                    });
+                // First, remove the hi part from lo_plus_hi.
+                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
+                // As 2^{96} = -1 mod P, we simply need to reduce lo - hi.
+                // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow.
+                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
+                reduce128(sum)
+            }
+        }
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY:
+        // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment
+        // and memory layout making `flatten_to_base` safe. This will create
+        // a vector of Goldilocks elements with value set to 0.
+        unsafe { flatten_to_base(vec![0u64; len]) }
+    }
+}
+
+/// Degree of the smallest permutation polynomial for Goldilocks.
+///
+/// As p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537, the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
+impl InjectiveMonomial<7> for Goldilocks {}
+
+impl PermutationMonomial<7> for Goldilocks {
+    /// In the field `Goldilocks`, `a^{1/7}` is equal to `a^{10540996611094048183}`.
+    ///
+    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2^32) + 1 = 1 mod (p - 1)`.
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self)
+    }
+}
+
+impl RawDataSerializable for Goldilocks {
+    impl_raw_serializable_primefield64!();
+}
+
+impl Field for Goldilocks {
+    #[cfg(all(
+        target_arch = "x86_64",
+        target_feature = "avx2",
+        not(target_feature = "avx512f")
+    ))]
+    type Packing = crate::PackedGoldilocksAVX2;
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+    type Packing = crate::PackedGoldilocksAVX512;
+
+    // PATCHED for bench_vs_plonky3: disable NEON packing for apples-to-apples
+    // scalar comparison against Lambda STARK. Upstream: `crate::PackedGoldilocksNeon`.
+    #[cfg(target_arch = "aarch64")]
+    type Packing = Self;
+
+    #[cfg(not(any(
+        all(
+            target_arch = "x86_64",
+            target_feature = "avx2",
+            not(target_feature = "avx512f")
+        ),
+        all(target_arch = "x86_64", target_feature = "avx512f"),
+        target_arch = "aarch64",
+    )))]
+    type Packing = Self;
+
+    // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator()
+    const GENERATOR: Self = Self::new(7);
+
+    fn is_zero(&self) -> bool {
+        self.value == 0 || self.value == Self::ORDER_U64
+    }
+
+    fn try_inverse(&self) -> Option<Self> {
+        if self.is_zero() {
+            return None;
+        }
+
+        Some(gcd_inversion(*self))
+    }
+
+    #[inline]
+    fn order() -> BigUint {
+        P.into()
+    }
+}
+
+// We use macros to implement QuotientMap<Int> for all integer types except for u64 and i64.
+quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
+quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
+quotient_map_large_uint!(
+    Goldilocks,
+    u64,
+    Goldilocks::ORDER_U64,
+    "`[0, 2^64 - 2^32]`",
+    "`[0, 2^64 - 1]`",
+    [u128]
+);
+quotient_map_large_iint!(
+    Goldilocks,
+    i64,
+    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
+    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
+    [(i128, u128)]
+);
+
+impl QuotientMap<u64> for Goldilocks {
+    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
+    ///
+    /// No reduction is needed as the internal value is allowed
+    /// to be any u64.
+    #[inline]
+    fn from_int(int: u64) -> Self {
+        Self::new(int)
+    }
+
+    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
+    ///
+    /// Return `None` if the given integer is greater than or equal to `p = 2^64 - 2^32 + 1`.
+    #[inline]
+    fn from_canonical_checked(int: u64) -> Option<Self> {
+        (int < Self::ORDER_U64).then(|| Self::new(int))
+    }
+
+    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
+    ///
+    /// # Safety
+    /// In this case this function is actually always safe as the internal
+    /// value is allowed to be any u64.
+    #[inline(always)]
+    unsafe fn from_canonical_unchecked(int: u64) -> Self {
+        Self::new(int)
+    }
+}
+
+impl QuotientMap<i64> for Goldilocks {
+    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
+    ///
+    /// We simply need to deal with the sign.
+    #[inline]
+    fn from_int(int: i64) -> Self {
+        if int >= 0 {
+            Self::new(int as u64)
+        } else {
+            Self::new(Self::ORDER_U64.wrapping_add_signed(int))
+        }
+    }
+
+    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
+    ///
+    /// Returns `None` if the input does not lie in the range `[-(2^63 - 2^31), 2^63 - 2^31]`.
+    #[inline]
+    fn from_canonical_checked(int: i64) -> Option<Self> {
+        const POS_BOUND: i64 = (P >> 1) as i64;
+        const NEG_BOUND: i64 = -POS_BOUND;
+        match int {
+            0..=POS_BOUND => Some(Self::new(int as u64)),
+            NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
+            _ => None,
+        }
+    }
+
+    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
+    ///
+    /// # Safety
+    /// In this case this function is actually always safe as the internal
+    /// value is allowed to be any u64.
+    #[inline(always)]
+    unsafe fn from_canonical_unchecked(int: i64) -> Self {
+        Self::from_int(int)
+    }
+}
+
+impl PrimeField for Goldilocks {
+    fn as_canonical_biguint(&self) -> BigUint {
+        BigUint::from(self.as_canonical_u64())
+    }
+}
+
+impl PrimeField64 for Goldilocks {
+    const ORDER_U64: u64 = P;
+
+    #[inline]
+    fn as_canonical_u64(&self) -> u64 {
+        let mut c = self.value;
+        // We only need one conditional subtraction, since 2 * ORDER would not fit in a u64.
+        if c >= Self::ORDER_U64 {
+            c -= Self::ORDER_U64;
+        }
+        c
+    }
+}
+
+impl TwoAdicField for Goldilocks {
+    const TWO_ADICITY: usize = 32;
+
+    fn two_adic_generator(bits: usize) -> Self {
+        assert!(bits <= Self::TWO_ADICITY);
+        Self::TWO_ADIC_GENERATORS[bits]
+    }
+}
+
+/// A const version of the addition function.
+///
+/// Useful for constructing constant values in const contexts. Outside of
+/// const contexts, Add should be used instead.
+#[inline]
+const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
+    let (sum, over) = lhs.value.overflowing_add(rhs.value);
+    let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
+    if over {
+        sum += Goldilocks::NEG_ORDER;
+    }
+    Goldilocks::new(sum)
+}
+
+impl Add for Goldilocks {
+    type Output = Self;
+
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        let (sum, over) = self.value.overflowing_add(rhs.value);
+        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
+        if over {
+            // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not
+            // sufficient for double-overflow.
+            // This assume does two things:
+            //  1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip
+            //     this check.
+            //  2. Hints to the compiler how rare this double-overflow is (thus handled better with
+            //     a branch).
+            unsafe {
+                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
+            }
+            branch_hint();
+            sum += Self::NEG_ORDER; // Cannot overflow.
+        }
+        Self::new(sum)
+    }
+}
+
+impl Sub for Goldilocks {
+    type Output = Self;
+
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        let (diff, under) = self.value.overflowing_sub(rhs.value);
+        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
+        if under {
+            // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not
+            // sufficient for double-underflow.
+            // This assume does two things:
+            //  1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER,
+            //     then it can skip this check.
+            //  2. Hints to the compiler how rare this double-underflow is (thus handled better
+            //     with a branch).
+            unsafe {
+                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
+            }
+            branch_hint();
+            diff -= Self::NEG_ORDER; // Cannot underflow.
+        }
+        Self::new(diff)
+    }
+}
+
+impl Neg for Goldilocks {
+    type Output = Self;
+
+    #[inline]
+    fn neg(self) -> Self::Output {
+        Self::new(Self::ORDER_U64 - self.as_canonical_u64())
+    }
+}
+
+impl Mul for Goldilocks {
+    type Output = Self;
+
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        reduce128(u128::from(self.value) * u128::from(rhs.value))
+    }
+}
+
+impl_add_assign!(Goldilocks);
+impl_sub_assign!(Goldilocks);
+impl_mul_methods!(Goldilocks);
+impl_div_methods!(Goldilocks, Goldilocks);
+
+impl Sum for Goldilocks {
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2.
+
+        // This sum will not overflow so long as iter.len() < 2^64.
+        let sum = iter.map(|x| x.value as u128).sum::<u128>();
+        reduce128(sum)
+    }
+}
+
+/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
+/// field order and `2^64`.
+#[inline]
+pub(crate) fn reduce128(x: u128) -> Goldilocks {
+    let (x_lo, x_hi) = split(x); // This is a no-op
+    let x_hi_hi = x_hi >> 32;
+    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
+
+    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
+    if borrow {
+        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
+        t0 -= Goldilocks::NEG_ORDER; // Cannot underflow.
+    }
+    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
+    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
+    Goldilocks::new(t2)
+}
+
+#[inline]
+#[allow(clippy::cast_possible_truncation)]
+const fn split(x: u128) -> (u64, u64) {
+    (x as u64, (x >> 64) as u64)
+}
+
+/// Fast addition modulo ORDER for x86-64.
+/// This function is marked unsafe for the following reasons:
+///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
+///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
+///     the registers, so its use is not recommended when either input will be used again.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+    unsafe {
+        let res_wrapped: u64;
+        let adjustment: u64;
+        core::arch::asm!(
+            "add {0}, {1}",
+            // Trick. The carry flag is set iff the addition overflowed.
+            // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
+            // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
+            // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
+            // on overflow; this happens to be NEG_ORDER.
+            // Note that the CPU does not realize that the result of sbb x, x does not actually depend
+            // on x. We must write the result to a register that we know to be ready. We have a
+            // dependency on {1} anyway, so let's use it.
+            "sbb {1:e}, {1:e}",
+            inlateout(reg) x => res_wrapped,
+            inlateout(reg) y => adjustment,
+            options(pure, nomem, nostack),
+        );
+        assume(x != 0 || (res_wrapped == y && adjustment == 0));
+        assume(y != 0 || (res_wrapped == x && adjustment == 0));
+        // Add NEG_ORDER == subtract ORDER.
+        // Cannot overflow unless the assumption that x + y < 2**64 + ORDER is incorrect.
+        res_wrapped + adjustment
+    }
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+    let (res_wrapped, carry) = x.overflowing_add(y);
+    // Below cannot overflow unless the assumption that x + y < 2**64 + ORDER is incorrect.
+    res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
+}
+
+/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm.
+///
+/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf.
+/// The key idea is to compute update factors which are incorrect by a known power of 2 which
+/// can be corrected at the end. These update factors can then be used to construct the inverse
+/// via a simple linear combination.
+///
+/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic.
+fn gcd_inversion(input: Goldilocks) -> Goldilocks {
+    // Initialise our values to the value we want to invert and the prime.
+    let (mut a, mut b) = (input.value, P);
+
+    // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`.
+    // This means we will need `126` iterations of the inner loop to ensure `len(a) + len(b) ≤ 2`.
+    // We split the iterations into 2 rounds of length 63.
+    const ROUND_SIZE: usize = 63;
+
+    // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted
+    // version which doesn't do any computations involving g. But either the compiler works this out
+    // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks.
+    let (f00, _, f10, _) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
+    let (_, _, f11, g11) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
+
+    // The update factors are i64's except we need to interpret -2^63 as 2^63.
+    // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
+    let u = from_unusual_int(f00);
+    let v = from_unusual_int(f10);
+    let u_fac11 = from_unusual_int(f11);
+    let v_fac11 = from_unusual_int(g11);
+
+    // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}.
+    // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66.
+    (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
+}
+
+/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63.
+const fn from_unusual_int(int: i64) -> Goldilocks {
+    if (int >= 0) || (int == i64::MIN) {
+        Goldilocks::new(int as u64)
+    } else {
+        Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::extension::BinomialExtensionField;
+    use p3_field_testing::{
+        test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field,
+    };
+
+    use super::*;
+
+    type F = Goldilocks;
+    type EF = BinomialExtensionField<F, 5>;
+
+    #[test]
+    fn test_goldilocks() {
+        let f = F::new(100);
+        assert_eq!(f.as_canonical_u64(), 100);
+
+        // Over the Goldilocks field, the following set of equations hold
+        // p               = 0
+        // 2^64 - 2^32 + 1 = 0
+        // 2^64            = 2^32 - 1
+        let f = F::new(u64::MAX);
+        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
+
+        let f = F::from_u64(u64::MAX);
+        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
+
+        // Generator check
+        let expected_multiplicative_group_generator = F::new(7);
+        assert_eq!(F::GENERATOR, expected_multiplicative_group_generator);
+        assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64);
+
+        // Check on `reduce_u128`
+        let x = u128::MAX;
+        let y = reduce128(x);
+        // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1
+        // 2^128 - 1 = (2^64 - 1) * (2^64 + 1)
+        //           = (2^32 - 1 - 1) * (2^32 - 1 + 1)
+        //           = (2^32 - 2) * (2^32)
+        //           = 2^64 - 2 * 2^32
+        //           = 2^64 - 2^33
+        //           = 2^32 - 1 - 2^33
+        //           = - 2^32 - 1
+        let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE;
+        assert_eq!(y, expected_result);
+
+        let f = F::new(100);
+        assert_eq!(f.injective_exp_n().injective_exp_root_n(), f);
+        assert_eq!(y.injective_exp_n().injective_exp_root_n(), y);
+        assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO);
+    }
+
+    // Goldilocks has a redundant representation for both 0 and 1.
+    const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)];
+    const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)];
+
+    // Get the prime factorization of the order of the multiplicative group.
+    // i.e. the prime factorization of P - 1.
+    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] {
+        [
+            (BigUint::from(2u8), 32),
+            (BigUint::from(3u8), 1),
+            (BigUint::from(5u8), 1),
+            (BigUint::from(17u8), 1),
+            (BigUint::from(257u16), 1),
+            (BigUint::from(65537u32), 1),
+        ]
+    }
+
+    test_field!(
+        crate::Goldilocks,
+        &super::ZEROS,
+        &super::ONES,
+        &super::multiplicative_group_prime_factorization()
+    );
+    test_prime_field!(crate::Goldilocks);
+    test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES);
+    test_two_adic_field!(crate::Goldilocks);
+
+    test_field_dft!(
+        radix2dit,
+        crate::Goldilocks,
+        super::EF,
+        p3_dft::Radix2Dit<_>
+    );
+    test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers);
+    test_field_dft!(
+        parallel,
+        crate::Goldilocks,
+        super::EF,
+        p3_dft::Radix2DitParallel<crate::Goldilocks>
+    );
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs
new file mode 100644
index 000000000..9447fe094
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs
@@ -0,0 +1,42 @@
+//! The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
+
+#![no_std]
+
+extern crate alloc;
+
+mod extension;
+mod goldilocks;
+mod mds;
+mod poseidon2;
+
+pub use goldilocks::*;
+pub use mds::*;
+pub use poseidon2::*;
+
+pub mod poseidon1;
+
+#[cfg(target_arch = "aarch64")]
+mod aarch64_neon;
+
+#[cfg(target_arch = "aarch64")]
+pub use aarch64_neon::*;
+
+#[cfg(all(
+    target_arch = "x86_64",
+    target_feature = "avx2",
+    not(target_feature = "avx512f")
+))]
+mod x86_64_avx2;
+
+#[cfg(all(
+    target_arch = "x86_64",
+    target_feature = "avx2",
+    not(target_feature = "avx512f")
+))]
+pub use x86_64_avx2::*;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+mod x86_64_avx512;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub use x86_64_avx512::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs
new file mode 100644
index 000000000..df41485b3
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs
@@ -0,0 +1,761 @@
+//! MDS matrices over the Goldilocks field, and permutations defined by them.
+//!
+//! NB: Not all sizes have fast implementations of their permutations.
+//! Supported sizes: 8, 12, 16, 24, 32, 64, 68.
+//! Sizes 8 and 12 are from Plonky2, size 16 was found as part of concurrent
+//! work by Angus Gruen and Hamish Ivey-Law. Other sizes are from Ulrich Haböck's
+//! database.
+
+use p3_dft::Radix2Bowers;
+use p3_mds::MdsPermutation;
+use p3_mds::karatsuba_convolution::Convolve;
+use p3_mds::util::{apply_circulant, apply_circulant_fft, first_row_to_first_col};
+use p3_symmetric::Permutation;
+
+use crate::{Goldilocks, reduce128};
+
+#[derive(Clone, Debug, Default)]
+pub struct MdsMatrixGoldilocks;
+
+/// Instantiate convolution for "small" RHS vectors over Goldilocks.
+///
+/// Here "small" means N = len(rhs) <= 16 and sum(r for r in rhs) <
+/// 2^51, though in practice the sum will be less than 2^9.
+#[derive(Debug)]
+pub struct SmallConvolveGoldilocks;
+impl Convolve<Goldilocks, i128, i64> for SmallConvolveGoldilocks {
+    const T_ZERO: i128 = 0;
+    const U_ZERO: i64 = 0;
+
+    #[inline(always)]
+    fn halve(val: i128) -> i128 {
+        val >> 1
+    }
+
+    /// Return the lift of a Goldilocks element, 0 <= input.value < 2^64
+    /// (not necessarily canonical). We widen immediately, since some valid Goldilocks
+    /// elements don't fit in an i64, and since in any case overflow can occur
+    /// for even the smallest convolutions.
+    #[inline(always)]
+    fn read(input: Goldilocks) -> i128 {
+        input.value as i128
+    }
+
+    /// For a convolution of size N, |x| < N * 2^64 and (as per the
+    /// assumption above), |y| < 2^51. So the product is at most N *
+    /// 2^115 which will not overflow for N <= 16. We widen `y` at
+    /// this point to perform the multiplication.
+    #[inline(always)]
+    fn parity_dot<const N: usize>(u: [i128; N], v: [i64; N]) -> i128 {
+        let mut s = 0i128;
+        for i in 0..N {
+            s += u[i] * v[i] as i128;
+        }
+        s
+    }
+
+    /// The assumptions above mean z < N^2 * 2^115, which is at most
+    /// 2^123 when N <= 16.
+    ///
+    /// NB: Even though intermediate values could be negative, the
+    /// output must be non-negative since the inputs were
+    /// non-negative.
+    #[inline(always)]
+    fn reduce(z: i128) -> Goldilocks {
+        debug_assert!(z >= 0);
+        reduce128(z as u128)
+    }
+}
+
+const FFT_ALGO: Radix2Bowers = Radix2Bowers;
+
+pub(crate) const MATRIX_CIRC_MDS_8_SML_ROW: [i64; 8] = [7, 1, 3, 8, 8, 3, 4, 9];
+
+/// First column of the circulant MDS matrix for width 8, derived from the first row.
+pub const MATRIX_CIRC_MDS_8_COL: [i64; 8] = first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW);
+
+impl Permutation<[Goldilocks; 8]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 8]) -> [Goldilocks; 8] {
+        const MATRIX_CIRC_MDS_8_SML_COL: [i64; 8] =
+            first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW);
+        SmallConvolveGoldilocks::apply(
+            input,
+            MATRIX_CIRC_MDS_8_SML_COL,
+            SmallConvolveGoldilocks::conv8,
+        )
+    }
+}
+impl MdsPermutation<Goldilocks, 8> for MdsMatrixGoldilocks {}
+
+pub(crate) const MATRIX_CIRC_MDS_12_SML_ROW: [i64; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10];
+
+/// First column of the circulant MDS matrix for width 12, derived from the first row.
+pub const MATRIX_CIRC_MDS_12_COL: [i64; 12] = first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW);
+
+impl Permutation<[Goldilocks; 12]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 12]) -> [Goldilocks; 12] {
+        const MATRIX_CIRC_MDS_12_SML_COL: [i64; 12] =
+            first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW);
+        SmallConvolveGoldilocks::apply(
+            input,
+            MATRIX_CIRC_MDS_12_SML_COL,
+            SmallConvolveGoldilocks::conv12,
+        )
+    }
+}
+impl MdsPermutation<Goldilocks, 12> for MdsMatrixGoldilocks {}
+
+pub(crate) const MATRIX_CIRC_MDS_16_SML_ROW: [i64; 16] =
+    [1, 1, 51, 1, 11, 17, 2, 1, 101, 63, 15, 2, 67, 22, 13, 3];
+
+impl Permutation<[Goldilocks; 16]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 16]) -> [Goldilocks; 16] {
+        const MATRIX_CIRC_MDS_16_SML_COL: [i64; 16] =
+            first_row_to_first_col(&MATRIX_CIRC_MDS_16_SML_ROW);
+        SmallConvolveGoldilocks::apply(
+            input,
+            MATRIX_CIRC_MDS_16_SML_COL,
+            SmallConvolveGoldilocks::conv16,
+        )
+    }
+}
+impl MdsPermutation<Goldilocks, 16> for MdsMatrixGoldilocks {}
+
+#[rustfmt::skip]
+pub(crate) const MATRIX_CIRC_MDS_24_GOLDILOCKS: [u64; 24] = [
+    0x5FFFFFFFA00AAAAB, 0x24021AB75BBFE656, 0x7BE9082D73B06DF5, 0x2282863E9C3A5A62,
+    0xE0071C70DFFC71C8, 0x796CB65AB42A1A63, 0xDBBBBFFADFFDDDE3, 0x23B88EE217C5C9C2,
+    0x20030C309FFB6DB7, 0x23C3C64763BE1E1D, 0x0F93B7C9CC51362E, 0xC697A1094BD0850A,
+    0xDFFFFFFF1FFC71C8, 0xC15A4FD614950302, 0xC41D883A4C4DEDF2, 0x187879BC23C46462,
+    0x5FFCF3CEDFFE79E8, 0x1C41DF105B82398E, 0x64444003DFFDDDDA, 0x76EDDBB6F7E51F95,
+    0x1FF8E38E20038E39, 0x214139BD5C40A09D, 0x3065B7CCF3B3B621, 0x23B6F4622485CEDC,
+];
+
+impl Permutation<[Goldilocks; 24]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 24]) -> [Goldilocks; 24] {
+        apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input)
+    }
+}
+impl MdsPermutation<Goldilocks, 24> for MdsMatrixGoldilocks {}
+
+#[rustfmt::skip]
+const MATRIX_CIRC_MDS_32_GOLDILOCKS: [u64; 32] = [
+    0x0800000000000000, 0x69249248B4924925, 0x3ABD5EAF15EAF57B, 0x294A5294739CE73A,
+    0x59E2D2CEB4B3C5A6, 0x087FBE00FF7C0220, 0xA554AA94A554AA96, 0xF00080FEFFDF8005,
+    0x64CCCCCC6666699A, 0x5B13AD8973B139D9, 0xAD4A55ACA54AD5AA, 0xDA496DA3B492DB8A,
+    0x4AD696955A5694B5, 0xA4A6B29A25B496D3, 0xA74EA162162BD3A9, 0xC698B3A5662CE98C,
+    0xA7FFFFFF55555556, 0x4AAAAAAA5AAAAAAB, 0xB047DC113DC11F71, 0x8BA2E8B99B26C9B3,
+    0xD259696C5A5B4D2E, 0xA7D540AA557EA9F6, 0x8B6E922D26DB249C, 0xFAAA805455602AAD,
+    0xCB33333266666334, 0xD13B17619B13B277, 0x45B26D9326E9374A, 0x52AB552A5AA9556B,
+    0x68ED2D2DB4B87697, 0x8B264C98A74E9D3B, 0x09EC23D83D847B09, 0x2C9A4D26669349A5,
+];
+
+impl Permutation<[Goldilocks; 32]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 32]) -> [Goldilocks; 32] {
+        const ENTRIES: [u64; 32] = first_row_to_first_col(&MATRIX_CIRC_MDS_32_GOLDILOCKS);
+        apply_circulant_fft(&FFT_ALGO, ENTRIES, &input)
+    }
+}
+impl MdsPermutation<Goldilocks, 32> for MdsMatrixGoldilocks {}
+
+#[rustfmt::skip]
+const MATRIX_CIRC_MDS_64_GOLDILOCKS: [u64; 64] = [
+    0x07FFFFFFFC000000, 0xFBFFFFFF04000001, 0x436DB6DB25B6DB6E, 0x4AAAAAAA5AAAAAAB,
+    0x45B2D96C6D96CB66, 0x3BC7BC7B87BC7BC8, 0x6318C63125294A53, 0xCB3672CCCD9CB368,
+    0xB43CB5A12D68796C, 0xFBFBFBFAFBFBFBFD, 0x883DBF107B7E2210, 0x8A7689B59B629DA3,
+    0xF7FEFFDF00000001, 0x7B7C83BBC83BC47C, 0xEFF0410107EF7F83, 0x2CD8B3629CB272CA,
+    0x9800019900CCCE67, 0xFBFFFBFF07FFFC01, 0x94EC4A758C4EC628, 0xDA5A5B4A6D2D2E1F,
+    0xFFEFC080FC003FFF, 0xBC387BC2C783BC79, 0xB492DB686D24B6F3, 0x1DB6925B4B6E2477,
+    0x7801E0EF87BFFF10, 0xFC0803FAFBFC0409, 0x3780FE03C086F21C, 0x8B749B224DB22D94,
+    0x32648B36B76E9923, 0x3BC3C3C387C3C3C4, 0x79AF286B4FCA1AF3, 0x9E2762758B627628,
+    0x52AAAAAA56AAAAAB, 0xFBFFFFFEFC000001, 0xF7FFFFFF08000001, 0x2CCCCCCC9CCCCCCD,
+    0xCF286BC946BCA1B0, 0xBC483B7B883B7C49, 0xD9364D9287C1F07D, 0xAD5A94A8A95AD5AA,
+    0xFF871002C400F1E1, 0xFC03FC02FC03FC05, 0xD29495A4D6D4B4A6, 0x6C926DD1DD24DB65,
+    0x1EDC247B4DB64937, 0x7C7B843B47BC437D, 0xA55A95AAAD5AD52C, 0x4A96D5A45AD694A6,
+    0xFE6664CBCD999801, 0xFC0003FF08000401, 0x1EC4F09D64EC4D8A, 0x9E1E1D2C8B4B4A5B,
+    0xD9270937709B64DC, 0x3BB77C4448843B78, 0xFFFFFFDF03FF0021, 0x59D8761D2D8A6299,
+    0xC3496878A5E5A4B5, 0xFBF80402FC0403F9, 0x5ECD9B360E142851, 0x6D925D6429D64976,
+    0xA8AE615C19CC2B99, 0xBC44444388444445, 0xDFE3F1F81CFC7E40, 0xDA4924916D24924A,
+];
+
+impl Permutation<[Goldilocks; 64]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 64]) -> [Goldilocks; 64] {
+        const ENTRIES: [u64; 64] = first_row_to_first_col(&MATRIX_CIRC_MDS_64_GOLDILOCKS);
+        apply_circulant_fft(&FFT_ALGO, ENTRIES, &input)
+    }
+}
+impl MdsPermutation<Goldilocks, 64> for MdsMatrixGoldilocks {}
+
+#[rustfmt::skip]
+const MATRIX_CIRC_MDS_68_GOLDILOCKS: [u64; 68] = [
+    0x03C3C3C3FC3C3C3C, 0x6799AFC54A69BC7D, 0xDA8C2C496A74B03B, 0x1E641D7AB35ED229,
+    0x9239DA20DA3A2686, 0x6E23D41459EBA8C4, 0x7BC412896E2A6B3A, 0x9082059089ABD4FC,
+    0x94A16FA8B0339EEE, 0x85650EC91BB519C9, 0x1600745267E94DE1, 0xFFFD8405C82020AB,
+    0x21BDE80429DCED6A, 0x8ACE123AF754E343, 0xFFC7211605D2BDAE, 0xC21187AE15900F4D,
+    0x9C4A889708568DC6, 0x65A5A726B5758D8E, 0x949DB90B9AC0D11A, 0x23B6CF7C368BBE52,
+    0xD5128DDF59CB5A35, 0xF53BCC5BDADF3A0A, 0xBA7C5112F4BAB1CD, 0x4B93989C5B729351,
+    0x6534B7E50E4AD1CB, 0x640061B54C918405, 0x0E66E1F90D2C9311, 0x31C8649B0FE7557F,
+    0x0E9190D165F4A8F3, 0x52DF336BB708F919, 0x3C0F6697F14065A5, 0xBE8190942EC50031,
+    0x60038E9ACC701118, 0x73F105909A55A88B, 0xFEBEBEBDABEBEBED, 0x6F52163A64B03467,
+    0xFBAE131F23A12F56, 0x1950493BC70D0676, 0x2886550DB5A1BBBF, 0x15B003D6E58181D7,
+    0x3A4E7D9D44F100F8, 0x6CC3AB896025E6A0, 0x7E23E68456F825E5, 0x079CDD570B591A16,
+    0xEC15A830C3D2CCD1, 0xCF4C722D2C0F8A0E, 0xC1BB6F5591B59A26, 0xB63A5931A607BDE0,
+    0x43A0AD0B71040187, 0x7E4B492889D1CEE0, 0x734153F3F0C31C5B, 0x98D8D756B2725A5B,
+    0x5589D20D74BA00B8, 0xB2DF58DF0A312509, 0xFABC378690D64A3A, 0x700640AFC244B695,
+    0xFFA652236547F3BE, 0x2B9CA498A001D059, 0x7DACA6F16787D5DE, 0xAAAD774FAC613EA3,
+    0xA88583816975CD56, 0x78B71DC516FF49CA, 0xC7BF095DF702FFA6, 0x78A60B3F971783B3,
+    0xCB158EF40BC75CAC, 0xA97E818DBC152B4C, 0x9FC8339D415C3999, 0x006A88C0A0D8201C,
+];
+
+impl Permutation<[Goldilocks; 68]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [Goldilocks; 68]) -> [Goldilocks; 68] {
+        apply_circulant(&MATRIX_CIRC_MDS_68_GOLDILOCKS, &input)
+    }
+}
+impl MdsPermutation<Goldilocks, 68> for MdsMatrixGoldilocks {}
+
+#[cfg(test)]
+mod tests {
+    use p3_symmetric::Permutation;
+
+    use super::{Goldilocks, MdsMatrixGoldilocks};
+
+    #[test]
+    fn goldilocks8() {
+        let input: [Goldilocks; 8] = Goldilocks::new_array([
+            2434589605738284713,
+            4817685620989478889,
+            13397079175138649456,
+            11944520631108649751,
+            1033251468644039632,
+            3092099742268329866,
+            7160548811622790454,
+            9959569614427134344,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 8] = Goldilocks::new_array([
+            16726687146516531007,
+            14721040752765534861,
+            15566838577475948790,
+            9095485010737904250,
+            11353934351835864222,
+            11056556168691087893,
+            4199602889124860181,
+            315643510993921470,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn goldilocks12() {
+        let input: [Goldilocks; 12] = Goldilocks::new_array([
+            14847187883725400244,
+            969392934980971521,
+            6996647758016470432,
+            4674844440624672154,
+            264841656685969785,
+            1246852265697711623,
+            18223868478428473484,
+            12122736699239070772,
+            11263701854732819430,
+            12739925508864285577,
+            11648637570857932167,
+            14090978315217600393,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 12] = Goldilocks::new_array([
+            9322351889214742299,
+            8700136572060418355,
+            4881757876459003977,
+            9899544690241851021,
+            480548822895830465,
+            5445915149371405525,
+            14955363277757168581,
+            6672733082273363313,
+            190938676320003294,
+            1613225933948270736,
+            3549006224849989171,
+            12169032187873197425,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn goldilocks16() {
+        let input: [Goldilocks; 16] = Goldilocks::new_array([
+            13216135600341032847,
+            15626390207663319651,
+            2052474569300149934,
+            4375663431730581786,
+            16596827905941257435,
+            10019626608444427271,
+            7831946179065963230,
+            17104499871144693506,
+            9021930732511690478,
+            6899419210615882449,
+            8131182521761419514,
+            432489675596019804,
+            8508050013409958723,
+            14134506582804571789,
+            13283546413390931641,
+            14711125975653831032,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 16] = Goldilocks::new_array([
+            9484392671298797780,
+            149770626972189150,
+            12125722600598304117,
+            15945232149672903756,
+            13199929870021500593,
+            18443980893262804946,
+            317150800081307627,
+            16910019239751125049,
+            1996802739033818490,
+            11668458913264624237,
+            11078800762167869397,
+            13758408662406282356,
+            11119677412113674380,
+            7344117715971661026,
+            4202436890275702092,
+            681166793519210465,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn goldilocks24() {
+        let input: [Goldilocks; 24] = Goldilocks::new_array([
+            11426771245122339662,
+            5975488243963332229,
+            11441424994503305651,
+            5755561333702259678,
+            7295454168648181339,
+            16724279929816174064,
+            32359231037136391,
+            3713621595270370753,
+            8421765959140936778,
+            12370571593326246544,
+            8633733294559731287,
+            12765436832373161027,
+            15606692828890413034,
+            8068160018166226874,
+            10719661629577139538,
+            13036735610140127982,
+            10213543772818211674,
+            8041886705706266368,
+            12022983417703446028,
+            4179370708601587579,
+            11125302089484330465,
+            9904943018174649533,
+            16178194376951442671,
+            1545799842160818502,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 24] = Goldilocks::new_array([
+            18431075688485197060,
+            14823984346528185622,
+            7262979358411339215,
+            14816911393874702213,
+            6721523710303409972,
+            10829861327716364029,
+            2456948878733883601,
+            11088379938350287658,
+            3820735023521527858,
+            9062288923770492958,
+            5159244568306327366,
+            1401669669887165869,
+            11908734248351870182,
+            10640195377186320543,
+            6552733980894593378,
+            17103376282032495459,
+            5204287788603805758,
+            17783185518697631139,
+            9006863878586007300,
+            11122535637762904803,
+            5271621316102699962,
+            9734499541452484536,
+            11778274360927642637,
+            3217831681350496533,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn goldilocks32() {
+        let input: [Goldilocks; 32] = Goldilocks::new_array([
+            8401806579759049284,
+            14709608922272986544,
+            8130995604641968478,
+            7833133203357642391,
+            10700492548100684406,
+            3941105252506602047,
+            8122370916776133262,
+            15079919378435648206,
+            8774521769784086994,
+            16794844316583392853,
+            9356562741425567167,
+            13317198313361936216,
+            7187680218428599522,
+            16525662096158660997,
+            540453741156061014,
+            16543585577270698663,
+            3802215918136285729,
+            11389297895303247764,
+            5133769394766075512,
+            1057795099426170863,
+            18037861421172314665,
+            17632255188776359310,
+            17616515088477043142,
+            13307921676744533876,
+            17602277262015191215,
+            15819040654617566738,
+            11961318546000835928,
+            15593174310433874065,
+            9152657050882549004,
+            4801868480369948110,
+            13202076339494141066,
+            726396847460932316,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 32] = Goldilocks::new_array([
+            1179701925859507209,
+            5543239597787055637,
+            5978278622530964070,
+            3622388166841103287,
+            11383243182536830899,
+            14719109850604985734,
+            17672601866826623850,
+            4879627080283827596,
+            7556887460241466109,
+            9548493506061808122,
+            13980851986825291174,
+            2029844508485082398,
+            10375517623784134775,
+            13067093881736606569,
+            6446569064196467795,
+            15375603814779462714,
+            11307946648742033371,
+            1593906954637160608,
+            5776169226282316678,
+            8167048017892669861,
+            3954052226208277367,
+            9346878497567392707,
+            5570872870988220142,
+            10792661164389799960,
+            17494962593174487938,
+            7080549557843445752,
+            14059834522311268132,
+            17747288366997773235,
+            17158122400620315305,
+            6816598002359267850,
+            12363049840026116993,
+            13313901185845854868,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn goldilocks64() {
+        let input: [Goldilocks; 64] = Goldilocks::new_array([
+            3471075506106776899,
+            4817046918282259009,
+            3480368692354016145,
+            18110937755057600106,
+            3130862083451221140,
+            15376650156021437015,
+            7997596749112997445,
+            7742916918728590149,
+            421644639408377358,
+            2491271421424548020,
+            1940196613872160755,
+            7152053147988203177,
+            13697425352450853423,
+            15877844788345672674,
+            17787098720906653510,
+            6857627524724866519,
+            8541180216786820396,
+            10769715704553877654,
+            9265712399189924160,
+            10220120296438955872,
+            18201417281995610945,
+            6749698931189855822,
+            13700000989116811950,
+            13205437213697578097,
+            10514342943989454609,
+            9926015350795325725,
+            2289808224483690257,
+            12598806357998460973,
+            14393945610969324307,
+            4744625557965362093,
+            2270701163031951561,
+            2927942398784334090,
+            5250916386894733430,
+            4030189910566345872,
+            4953663590324639075,
+            1241519685782896035,
+            8681312160951359069,
+            8236353015475387411,
+            4972690458759871996,
+            1396852754187463352,
+            17512022752774329733,
+            14009268822557836700,
+            1346736409027879377,
+            7609463340861239931,
+            10701512803758419515,
+            5067199073587389986,
+            5030018986055211116,
+            17692625804700013551,
+            9992938630604785132,
+            15350127009762647067,
+            10247405821493235386,
+            15172888833500531069,
+            14657693742399622179,
+            7391511805216089127,
+            2035742693690795598,
+            4047216012963057952,
+            12602085105939403203,
+            16985723692990258059,
+            12141021186082151434,
+            3174646196626212833,
+            16484520987666295947,
+            10579720164460442970,
+            9596917135039689219,
+            13761818390665814258,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 64] = Goldilocks::new_array([
+            9158798369861934356,
+            9224859686427886689,
+            16948559910286211274,
+            15765762765140902574,
+            16202509467561200764,
+            1911749439284071529,
+            4607026757869726805,
+            8473827004973131317,
+            13716800466551879373,
+            6670177022201597800,
+            17416833238376299449,
+            14953676562252669578,
+            5828107070718286209,
+            17980287408679531241,
+            2220583438808757820,
+            14564318040622847100,
+            3950519594558514416,
+            12164610170526828198,
+            457385640833960098,
+            14068973922383216628,
+            9614382247226943793,
+            3932756878771319222,
+            12728498054939249570,
+            9435109056498897661,
+            7283114805836756402,
+            1720178259138435097,
+            11496602000538177285,
+            7736206812858942065,
+            14289784438950643645,
+            12052665489155550962,
+            12918409840610303255,
+            5224324424989208352,
+            7826309014606327907,
+            11657314889847733528,
+            13899641072303006348,
+            7501780959676548477,
+            1064261716045449147,
+            1487682458939665452,
+            10894217148983862136,
+            12785338167343566981,
+            8043323074629160032,
+            10852328074701301213,
+            15029722608724150267,
+            2611937278660861263,
+            13995790409949796943,
+            7103138700054564899,
+            12756778219044204581,
+            4147399997707606088,
+            11930966590061754579,
+            16708700985380478903,
+            2370160521342035603,
+            14893791582608133454,
+            15313288276425450946,
+            16224601303711716386,
+            4488931442519177087,
+            7443169181907410918,
+            12381442753785370161,
+            16366345507676500076,
+            8097905256807642731,
+            8504207502183388457,
+            11400931328719780407,
+            10879211614969476303,
+            7265889003783205111,
+            7322738272300165489,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn goldilocks68() {
+        let input: [Goldilocks; 68] = Goldilocks::new_array([
+            16450563043143968653,
+            3688080826640678185,
+            133253417037384537,
+            17501558583799613353,
+            14920674569425704293,
+            5030578721963251055,
+            9795600398273758687,
+            402012644192671817,
+            10657312189068414445,
+            9508835336085746575,
+            16081669758721272608,
+            2072823794278273547,
+            16831381326702573736,
+            11381683312293543190,
+            5679539322738625588,
+            9346499485038639332,
+            15554202803455984983,
+            18373955571490331663,
+            11323895584334729789,
+            16834542679468148445,
+            14751528164286075953,
+            3755158780970327991,
+            12622814707645103582,
+            10329238611694882547,
+            7642766530280843057,
+            4876120096290984742,
+            412912224820604426,
+            9118233770240274553,
+            3626520971021993076,
+            10841049054903806738,
+            18205546599950141835,
+            7198482606375262809,
+            17183313930831625294,
+            10181033256431249241,
+            1061211413812819905,
+            3980261141891682525,
+            5674176959446948353,
+            6062696542969845681,
+            3383081006315025715,
+            8812665902421024067,
+            3093645099818246186,
+            16178737149039707082,
+            8204245222345541411,
+            11072582337937050490,
+            17969785901925882398,
+            4670890092981706609,
+            12537558683977529426,
+            12084598516323376868,
+            16293685096019175644,
+            10117612240421467846,
+            17873102395739074620,
+            11220493906741851877,
+            4632957003022201019,
+            12934229307704669322,
+            2152792796882257594,
+            12521131928134126701,
+            17472006670677761650,
+            4560570065837283016,
+            6315543803073912887,
+            4098689719955359793,
+            1784883877365258237,
+            6837590090927294950,
+            2391417016765166652,
+            16389291664603960875,
+            12285946887702044436,
+            7231705445010258971,
+            12976071926225281356,
+            8829402645443096358,
+        ]);
+
+        let output = MdsMatrixGoldilocks.permute(input);
+
+        let expected: [Goldilocks; 68] = Goldilocks::new_array([
+            4984914285749049383,
+            10397959071664799177,
+            3331616814639908945,
+            4252459885611162121,
+            5517786723806029201,
+            1826620401370703815,
+            8257849352373689773,
+            1722805960790112693,
+            17654983138917187833,
+            7542660006721409612,
+            1970182718241277021,
+            12865815507550811641,
+            17507096607056552658,
+            7988714902687660369,
+            150082662759625574,
+            17329095993317360383,
+            965880604543562997,
+            2820931239306841741,
+            1980667983336380501,
+            3781794112174728826,
+            7323192150179872391,
+            12243426826276589932,
+            315076483410634889,
+            3221894784246078707,
+            3515955216509190252,
+            964376148920419876,
+            7679719864273407732,
+            2516714701741920303,
+            4837221266652621366,
+            15301563603415983061,
+            10380321314559647625,
+            3023678426639670063,
+            12020917879204725519,
+            10595808165609787680,
+            14199186729378048831,
+            4520610719509879248,
+            9983949546821718635,
+            5066092593424854949,
+            13843503196305181790,
+            14296362815835302652,
+            6766348697864530153,
+            13804582129741554661,
+            8032169955336281598,
+            5198513488794721460,
+            10613667919514788349,
+            7948289550930596506,
+            14118391408956101449,
+            4356952068887595371,
+            709878153008378134,
+            17168579964784489802,
+            17840495726541494819,
+            2710471020841761312,
+            9950159372116756450,
+            3909574932971200058,
+            2430964021804554670,
+            6035162446515244642,
+            14656543530572478095,
+            1539013407173403800,
+            4150113154618904744,
+            4904646199269229662,
+            17257014030727492672,
+            3791823431764085889,
+            13680668409434600948,
+            12367427987617118934,
+            12462908457168650050,
+            10891613749697412017,
+            6867760775372053830,
+            12474954319307005079,
+        ]);
+
+        assert_eq!(output, expected);
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs
new file mode 100644
index 000000000..89da79e45
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs
@@ -0,0 +1,1143 @@
+//! Poseidon1 permutation for Goldilocks.
+//!
+//! # Overview
+//!
+//! This module provides the Poseidon1 hash permutation instantiated for the
+//! Goldilocks field (p = 2^64 - 2^32 + 1). The public API is a single type
+//! alias that transparently dispatches to the best available implementation.
+//!
+//! # Platform Dispatch
+//!
+//! On **aarch64**, the type alias resolves to a dual-dispatch wrapper:
+//! scalar permutations use NEON-accelerated MDS for full rounds with
+//! LLVM-optimized sparse partial rounds, while packed NEON permutations
+//! use the fused dual-lane ASM path (w8) or per-lane scalar path (w12).
+//!
+//! On **all other platforms**, it resolves to the generic Poseidon1
+//! implementation with Karatsuba MDS convolution.
+//!
+//! No `#[cfg]` is needed in calling code.
+//!
+//! # MDS Matrix
+//!
+//! The MDS matrix is a **circulant** matrix sourced from the MDS crate.
+//! At runtime, it is applied via fast Karatsuba convolution (sub-O(t^2)).
+//! During initialization only, it is expanded to dense form for the
+//! sparse matrix decomposition of partial rounds.
+//!
+//! # Round Constants
+//!
+//! Generated by the Grain LFSR (Poseidon1 paper, Appendix E) with SBOX=0 (x^alpha encoding).
+
+use p3_poseidon1::{
+    Poseidon1, Poseidon1Constants, Poseidon1ExternalLayerGeneric, Poseidon1InternalLayerGeneric,
+};
+
+use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL};
+use crate::{Goldilocks, MdsMatrixGoldilocks};
+
+/// S-box degree `d` for Goldilocks Poseidon1: the S-box maps `x` to `x^d`.
+///
+/// `x^d` is a permutation of the field only when `gcd(d, p - 1) = 1`. Since
+/// `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`, every `d` from 2 to 6 shares a
+/// factor with `p - 1`, so the smallest valid non-trivial exponent is 7.
+pub const GOLDILOCKS_S_BOX_DEGREE: u64 = 7;
+
+/// Number of full rounds in each half of Goldilocks Poseidon (`RF / 2`).
+///
+/// The total number of full rounds is `RF = 8`: 4 before the partial rounds and 4 after.
+/// Follows the Poseidon paper's security analysis (Section 5.4) with a +2 RF margin.
+pub const GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS: usize = 4;
+
+/// Number of partial rounds for Goldilocks Poseidon (width 8).
+///
+/// Derived from the interpolation bound in the Poseidon paper (Eq. 3), with α = 7 and t = 8:
+///
+///   R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5
+///            = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20
+///
+/// With the +7.5% security margin (Section 5.4): ⌈1.075 × 20⌉ = 22.
+pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8: usize = 22;
+
+/// Number of partial rounds for Goldilocks Poseidon (width 12).
+///
+/// Same interpolation bound as width 8, evaluated at t = 12:
+///
+///   R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20
+///
+/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
+pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12: usize = 22;
+
+/// Generic (non-fused) Poseidon1 permutation for Goldilocks.
+///
+/// Uses the platform-independent Poseidon1 implementation with Karatsuba
+/// MDS convolution. Used directly for widths not supported by the fused
+/// type (e.g. 16, 24) and as the non-aarch64 fallback for widths 8 and 12.
+pub type Poseidon1GoldilocksGeneric<const WIDTH: usize> = Poseidon1<
+    Goldilocks,
+    Poseidon1ExternalLayerGeneric<Goldilocks, MdsMatrixGoldilocks, WIDTH>,
+    Poseidon1InternalLayerGeneric<Goldilocks, WIDTH>,
+    WIDTH,
+    GOLDILOCKS_S_BOX_DEGREE,
+>;
+
+/// Unified Poseidon1 permutation for Goldilocks (aarch64 definition).
+///
+/// Resolves to the dual-dispatch wrapper `Poseidon1GoldilocksDispatch`:
+/// scalar permutations use NEON MDS for full rounds with sparse partial
+/// rounds; packed NEON permutations use fused dual-lane ASM (width 8) or a
+/// per-lane scalar path (width 12).
+///
+/// On all other targets this name aliases [`Poseidon1GoldilocksGeneric`].
+///
+/// Supports both scalar and packed state representations transparently.
+#[cfg(target_arch = "aarch64")]
+pub type Poseidon1Goldilocks<const WIDTH: usize> = crate::Poseidon1GoldilocksDispatch<WIDTH>;
+
+/// Unified Poseidon1 permutation for Goldilocks (non-aarch64 definition).
+///
+/// This is the definition compiled on every target except aarch64: a plain
+/// alias for [`Poseidon1GoldilocksGeneric`] (Karatsuba MDS convolution).
+///
+/// On aarch64 the same name instead resolves to the dual-dispatch wrapper
+/// `Poseidon1GoldilocksDispatch`, so calling code needs no `#[cfg]`.
+///
+/// Supports both scalar and packed state representations transparently.
+#[cfg(not(target_arch = "aarch64"))]
+pub type Poseidon1Goldilocks<const WIDTH: usize> = Poseidon1GoldilocksGeneric<WIDTH>;
+
+/// Round constants for width-8 Poseidon1 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
+///
+/// Generated by `poseidon/generate_constants.py --field goldilocks --width 8`.
+///
+/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)].
+pub const GOLDILOCKS_POSEIDON1_RC_8: [[Goldilocks; 8]; 30] = Goldilocks::new_2d_array([
+    // Initial full rounds (4)
+    [
+        0xdd5743e7f2a5a5d9,
+        0xcb3a864e58ada44b,
+        0xffa2449ed32f8cdc,
+        0x42025f65d6bd13ee,
+        0x7889175e25506323,
+        0x34b98bb03d24b737,
+        0xbdcc535ecc4faa2a,
+        0x5b20ad869fc0d033,
+    ],
+    [
+        0xf1dda5b9259dfcb4,
+        0x27515210be112d59,
+        0x4227d1718c766c3f,
+        0x26d333161a5bd794,
+        0x49b938957bf4b026,
+        0x4a56b5938b213669,
+        0x1120426b48c8353d,
+        0x6b323c3f10a56cad,
+    ],
+    [
+        0xce57d6245ddca6b2,
+        0xb1fc8d402bba1eb1,
+        0xb5c5096ca959bd04,
+        0x6db55cd306d31f7f,
+        0xc49d293a81cb9641,
+        0x1ce55a4fe979719f,
+        0xa92e60a9d178a4d1,
+        0x002cc64973bcfd8c,
+    ],
+    [
+        0xcea721cce82fb11b,
+        0xe5b55eb8098ece81,
+        0x4e30525c6f1ddd66,
+        0x43c6702827070987,
+        0xaca68430a7b5762a,
+        0x3674238634df9c93,
+        0x88cee1c825e33433,
+        0xde99ae8d74b57176,
+    ],
+    // Partial rounds (22)
+    [
+        0x488897d85ff51f56,
+        0x1140737ccb162218,
+        0xa7eeb9215866ed35,
+        0x9bd2976fee49fcc9,
+        0xc0c8f0de580a3fcc,
+        0x4fb2dae6ee8fc793,
+        0x343a89f35f37395b,
+        0x223b525a77ca72c8,
+    ],
+    [
+        0x56ccb62574aaa918,
+        0xc4d507d8027af9ed,
+        0xa080673cf0b7e95c,
+        0xf0184884eb70dcf8,
+        0x044f10b0cb3d5c69,
+        0xe9e3f7993938f186,
+        0x1b761c80e772f459,
+        0x606cec607a1b5fac,
+    ],
+    [
+        0x14a0c2e1d45f03cd,
+        0x4eace8855398574f,
+        0xf905ca7103eff3e6,
+        0xf8c8f8d20862c059,
+        0xb524fe8bdd678e5a,
+        0xfbb7865901a1ec41,
+        0x014ef1197d341346,
+        0x9725e20825d07394,
+    ],
+    [
+        0xfdb25aef2c5bae3b,
+        0xbe5402dc598c971e,
+        0x93a5711f04cdca3d,
+        0xc45a9a5b2f8fb97b,
+        0xfe8946a924933545,
+        0x2af997a27369091c,
+        0xaa62c88e0b294011,
+        0x058eb9d810ce9f74,
+    ],
+    [
+        0xb3cb23eced349ae4,
+        0xa3648177a77b4a84,
+        0x43153d905992d95d,
+        0xf4e2a97cda44aa4b,
+        0x5baa2702b908682f,
+        0x082923bdf4f750d1,
+        0x98ae09a325893803,
+        0xf8a6475077968838,
+    ],
+    [
+        0xceb0735bf00b2c5f,
+        0x0a1a5d953888e072,
+        0x2fcb190489f94475,
+        0xb5be06270dec69fc,
+        0x739cb934b09acf8b,
+        0x537750b75ec7f25b,
+        0xe9dd318bae1f3961,
+        0xf7462137299efe1a,
+    ],
+    [
+        0xb1f6b8eee9adb940,
+        0xbdebcc8a809dfe6b,
+        0x40fc1f791b178113,
+        0x3ac1c3362d014864,
+        0x9a016184bdb8aeba,
+        0x95f2394459fbc25e,
+        0xe3f34a07a76a66c2,
+        0x8df25f9ad98b1b96,
+    ],
+    [
+        0x85ffc27171439d9d,
+        0xddcb9a2dcfd26910,
+        0x26b5ba4bf3afb94e,
+        0xffff9cc7c7651e2f,
+        0x8c88364698280b55,
+        0xebc114167b910501,
+        0x2d77b4d89ecfb516,
+        0x332e0828eba151f2,
+    ],
+    [
+        0x46fa6a6450dd4735,
+        0xd00db7dd92384a33,
+        0x5fd4fb751f3a5fc5,
+        0x496fb90c0bb65ea2,
+        0xf3baec0bb87cc5c7,
+        0x862a3c0a7d4c7713,
+        0xbf5f38336a3f47d8,
+        0x41ad9dbc1394a20c,
+    ],
+    [
+        0xcc535945b7dbf0f7,
+        0x82af2bc93685bcec,
+        0x8e4c8d0c8cebfccd,
+        0x17cb39417e84597e,
+        0xd4a965a8c749b232,
+        0xa2cab040f33f3ee5,
+        0xa98811a1fed4e3a6,
+        0x1cc48b54f377e2a1,
+    ],
+    [
+        0xe40cd4f6c5609a27,
+        0x11de79ebca97a4a4,
+        0x9177c73d8b7e929d,
+        0x2a6fe8085797e792,
+        0x3de6e93329f8d5ae,
+        0x3f7af9125da962ff,
+        0xd710682cfc77d3ac,
+        0x48faf05f3b053cf4,
+    ],
+    [
+        0x287db8630da89c8b,
+        0x4d0de32053cb30e9,
+        0x8b37a4f20c5ada7b,
+        0xe7cc6ebe78c84ecf,
+        0x240bdc0a66a2610d,
+        0x8299e7f02caa1650,
+        0x380a53fefb6e754e,
+        0x684a1d8cf8eb6810,
+    ],
+    [
+        0xe839452eb4b8a5e1,
+        0xb03fa62e90626af4,
+        0x11a688602fbc5efc,
+        0x30dda75c355a2d62,
+        0x0f712adcb73810de,
+        0xffdc1102187f1ae1,
+        0x40c34f398254b99c,
+        0xede021b9dc289a4a,
+    ],
+    [
+        0x8b7b05225c4e7dad,
+        0x3bc794346f9d9ff9,
+        0xfccb5a57f2ca86ff,
+        0xbb1502015a7da9d4,
+        0xd7e0a35d4352a015,
+        0x27af7a44f8160931,
+        0xc37442f6782f4615,
+        0xbdf392a9bd095dcb,
+    ],
+    [
+        0xc17f55037cf00de9,
+        0xbcffedd34c71a874,
+        0x5eb45d2a8133d1f2,
+        0xbabe251e1612ebdf,
+        0x3efeb9fbe438c536,
+        0x2d7cef97b4afe1cf,
+        0xe5de1b4660016c0b,
+        0xcdcc26c332f5657c,
+    ],
+    [
+        0xe01dd653daf15809,
+        0xb0a6bdd4b41094b5,
+        0x27eac858b0b03a05,
+        0x51d43b5e93adbdc0,
+        0x8b89a23b0fea5fc9,
+        0xdc8ac3b14f7f2fc1,
+        0xe793f82f1efec039,
+        0x9f6f2cf8969e7b80,
+    ],
+    [
+        0x49d45382e0f21d4a,
+        0x5f4ad1797cd72786,
+        0x4dc3dbebfd45f795,
+        0x03a3ef84dba6e1bc,
+        0x204bc9b3d3fc4c01,
+        0x9ad706081e89b9ba,
+        0x638bfb4d840e9f89,
+        0x5ef2938cd095ae35,
+    ],
+    [
+        0x42cca18ebeb265c8,
+        0xb7b2ec5c29aecbf8,
+        0x0d84f9535dc78f0f,
+        0x04e64ad942e77b8c,
+        0xb4880dffffc9da0b,
+        0x16db16d9c29adeb1,
+        0x09bbaf2a0590cd1e,
+        0x76460e74961fcf8d,
+    ],
+    [
+        0xed12a2276dfa1553,
+        0x0b5acec5de0436fd,
+        0x3c6cfea033a1f0a8,
+        0x2b5ecefe546cac15,
+        0x6e2d82884cd3bf6f,
+        0xc134878d1add7b83,
+        0x997963422eb7a280,
+        0x5e834537ac648cf6,
+    ],
+    [
+        0x89e779214737c0b7,
+        0x1a8c05e8581ad95b,
+        0x8d18b72796437cf7,
+        0xe7252c949e04b106,
+        0x53267c4fd174585a,
+        0xa16ef5d9c81dad47,
+        0xda65191937270a46,
+        0xcb2a5b55f2df664c,
+    ],
+    [
+        0x854aee2dc1924137,
+        0xf37013c9d479ece6,
+        0x0e163bc0630c4696,
+        0x384ee64955048f76,
+        0xf65d814e28ee4ec5,
+        0xe57bc564fd82f1b1,
+        0x4b338937b6876614,
+        0x66ee0b04ed43cd8d,
+    ],
+    [
+        0x49884bf25f4ef15d,
+        0xeb51fe28de1c6f54,
+        0x2cd64e84fce8dfcc,
+        0x29164a96a541a013,
+        0x173ce7558f4cacb8,
+        0xeb5b1ce5877c89e9,
+        0x5faff4b0f5217bf6,
+        0xac42d0b1c20f205e,
+    ],
+    // Terminal full rounds (4)
+    [
+        0xfb1d6bf0ca43221b,
+        0x97b0a1b01d6a2955,
+        0x08c60bd622952b30,
+        0x43f2be0f9e24147c,
+        0xfa7268b7d3730f5d,
+        0x43a6c419a23983bb,
+        0xcd77c1f7b29b113c,
+        0xcfa43c9db8eec29f,
+    ],
+    [
+        0xcaaa95a6c7365dec,
+        0x0a91193f798f3be0,
+        0x1104497652735dc6,
+        0x35aecb93663b515e,
+        0x8dbc9916065aa858,
+        0xada8f7a0266579ed,
+        0x524dee7bec1ea789,
+        0xa93aee9dd5af9521,
+    ],
+    [
+        0x9d1f1b54750d707e,
+        0x7c9feab87096d5dc,
+        0xa2e1fb19f9d4261b,
+        0xb714deb448de6346,
+        0x225d1f0d011c5403,
+        0x1549b7f1d28cedc0,
+        0xaef3e46f97d43942,
+        0x6dfc7ffe0b38bf08,
+    ],
+    [
+        0x7de853fdc542b663,
+        0xa68ecc96610657b2,
+        0xe88bb5428af289b1,
+        0xd7cfa1504c5569f5,
+        0x78a9aad0d642d30a,
+        0xd68315f2353dce52,
+        0x46e56300f86fcfd5,
+        0x323d95332b145fd6,
+    ],
+]);
+
+/// Round constants for width-12 Poseidon1 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
+///
+/// Generated by `poseidon/generate_constants.py --field goldilocks --width 12`.
+///
+/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)].
+pub const GOLDILOCKS_POSEIDON1_RC_12: [[Goldilocks; 12]; 30] = Goldilocks::new_2d_array([
+    // Initial full rounds (4)
+    [
+        0x13dcf33aba214f46,
+        0x30b3b654a1da6d83,
+        0x1fc634ada6159b56,
+        0x937459964dc03466,
+        0xedd2ef2ca7949924,
+        0xede9affde0e22f68,
+        0x8515b9d6bac9282d,
+        0x6b5c07b4e9e900d8,
+        0x1ec66368838c8a08,
+        0x9042367d80d1fbab,
+        0x400283564a3c3799,
+        0x4a00be0466bca75e,
+    ],
+    [
+        0x7913beee58e3817f,
+        0xf545e88532237d90,
+        0x22f8cb8736042005,
+        0x6f04990e247a2623,
+        0xfe22e87ba37c38cd,
+        0xd20e32c85ffe2815,
+        0x117227674048fe73,
+        0x4e9fb7ea98a6b145,
+        0xe0866c232b8af08b,
+        0x00bbc77916884964,
+        0x7031c0fb990d7116,
+        0x240a9e87cf35108f,
+    ],
+    [
+        0x2e6363a5a12244b3,
+        0x5e1c3787d1b5011c,
+        0x4132660e2a196e8b,
+        0x3a013b648d3d4327,
+        0xf79839f49888ea43,
+        0xfe85658ebafe1439,
+        0xb6889825a14240bd,
+        0x578453605541382b,
+        0x4508cda8f6b63ce9,
+        0x9c3ef35848684c91,
+        0x0812bde23c87178c,
+        0xfe49638f7f722c14,
+    ],
+    [
+        0x8e3f688ce885cbf5,
+        0xb8e110acf746a87d,
+        0xb4b2e8973a6dabef,
+        0x9e714c5da3d462ec,
+        0x6438f9033d3d0c15,
+        0x24312f7cf1a27199,
+        0x23f843bb47acbf71,
+        0x9183f11a34be9f01,
+        0x839062fbb9d45dbf,
+        0x24b56e7e6c2e43fa,
+        0xe1683da61c962a72,
+        0xa95c63971a19bfa7,
+    ],
+    // Partial rounds (22)
+    [
+        0x4adf842aa75d4316,
+        0xf8fbb871aa4ab4eb,
+        0x68e85b6eb2dd6aeb,
+        0x07a0b06b2d270380,
+        0xd94e0228bd282de4,
+        0x8bdd91d3250c5278,
+        0x209c68b88bba778f,
+        0xb5e18cdab77f3877,
+        0xb296a3e808da93fa,
+        0x8370ecbda11a327e,
+        0x3f9075283775dad8,
+        0xb78095bb23c6aa84,
+    ],
+    [
+        0x3f36b9fe72ad4e5f,
+        0x69bc96780b10b553,
+        0x3f1d341f2eb7b881,
+        0x4e939e9815838818,
+        0xda366b3ae2a31604,
+        0xbc89db1e7287d509,
+        0x6102f411f9ef5659,
+        0x58725c5e7ac1f0ab,
+        0x0df5856c798883e7,
+        0xf7bb62a8da4c961b,
+        0xc68be7c94882a24d,
+        0xaf996d5d5cdaedd9,
+    ],
+    [
+        0x9717f025e7daf6a5,
+        0x6436679e6e7216f4,
+        0x8a223d99047af267,
+        0xbb512e35a133ba9a,
+        0xfbbf44097671aa03,
+        0xf04058ebf6811e61,
+        0x5cca84703fac7ffb,
+        0x9b55c7945de6469f,
+        0x8e05bf09808e934f,
+        0x2ea900de876307d7,
+        0x7748fff2b38dfb89,
+        0x6b99a676dd3b5d81,
+    ],
+    [
+        0xac4bb7c627cf7c13,
+        0xadb6ebe5e9e2f5ba,
+        0x2d33378cafa24ae3,
+        0x1e5b73807543f8c2,
+        0x09208814bfebb10f,
+        0x782e64b6bb5b93dd,
+        0xadd5a48eac90b50f,
+        0xadd4c54c736ea4b1,
+        0xd58dbb86ed817fd8,
+        0x6d5ed1a533f34ddd,
+        0x28686aa3e36b7cb9,
+        0x591abd3476689f36,
+    ],
+    [
+        0x047d766678f13875,
+        0xa2a11112625f5b49,
+        0x21fd10a3f8304958,
+        0xf9b40711443b0280,
+        0xd2697eb8b2bde88e,
+        0x3493790b51731b3f,
+        0x11caf9dd73764023,
+        0x7acfb8f72878164e,
+        0x744ec4db23cefc26,
+        0x1e00e58f422c6340,
+        0x21dd28d906a62dda,
+        0xf32a46ab5f465b5f,
+    ],
+    [
+        0xbfce13201f3f7e6b,
+        0xf30d2e7adb5304e2,
+        0xecdf4ee4abad48e9,
+        0xf94e82182d395019,
+        0x4ee52e3744d887c5,
+        0xa1341c7cac0083b2,
+        0x2302fb26c30c834a,
+        0xaea3c587273bf7d3,
+        0xf798e24961823ec7,
+        0x962deba3e9a2cd94,
+        0xb36ee79485ca4707,
+        0xd380199eddd2de52,
+    ],
+    [
+        0x70971fc4e6f85305,
+        0x8e722f6e5dc32699,
+        0xa0883df133052b92,
+        0x8f86c6a3eb7d01a4,
+        0x763649c8b670bdc5,
+        0x830d5c82b808759b,
+        0xaa1da8bb91da02e7,
+        0x9bc9bf629e211c4d,
+        0x0f0a899b10a4dea8,
+        0xb883bdcee7c6b356,
+        0x78c7101e7496ae1e,
+        0x2fd6c5a8bf1e5ca6,
+    ],
+    [
+        0xe2a6e06e61fcec9c,
+        0xebfce7d5c5b3dbd5,
+        0xca2eeca4bb485d85,
+        0xc2b875537c42eb69,
+        0x6faf849976873328,
+        0xfc3fcb6e81ad4cc3,
+        0x180dd95503955a28,
+        0xd40f19a3c9fe1520,
+        0x49d178ddbf7fd96d,
+        0x3950bee2e10e0297,
+        0x437b90cf295be062,
+        0xa5cd126edffad23b,
+    ],
+    [
+        0xdf58134c134491c2,
+        0x0677eca229d9f7bd,
+        0x492200a1f7d83a3c,
+        0xafb58c9810a43645,
+        0x7659077c5a9c208e,
+        0x30b4bc83706995cd,
+        0xc98fa77bbbef3a3b,
+        0x84a82905750b3109,
+        0x72f2a02326aeb69b,
+        0x8d27a2a2d73a848a,
+        0xaa9e30a80bde4b68,
+        0x63abb1415e050474,
+    ],
+    [
+        0x1c4bd1e816050a7e,
+        0x15d1502e4f469dfd,
+        0x53989d594b0c4cd8,
+        0x7a1a4c83cb7e377e,
+        0x1b52f8a9944e480e,
+        0xeb7b03f76a91a79e,
+        0x0073a4fc9328c69e,
+        0x2c7b16f8620d9de4,
+        0x950d052963e46bc4,
+        0x8d201ba1a9c89fac,
+        0xd3502941bdf35503,
+        0x7c6dfcd5af8676fb,
+    ],
+    [
+        0xf8a6cd02e92cdb0b,
+        0x6e7500f3a5464b22,
+        0x07637eabba4bdd20,
+        0x88b82717beee0e14,
+        0xbaa2b1cd3dd4c79a,
+        0xdfecc3aebec4cfa6,
+        0x7561087b0cff0166,
+        0x538fcac317a703a6,
+        0xd7d6c6eeeeeeea19,
+        0xd647b1ee441658a0,
+        0xdf4442110236c546,
+        0x559ef2c6dd73ec15,
+    ],
+    [
+        0x4c0f5fc6c0dda3d1,
+        0x685010cc3100cea7,
+        0x2fb6ba8aa0344440,
+        0xb515f0a3ca75f1fb,
+        0x886887eaecb87c10,
+        0xf03ec3fd710abb04,
+        0xd3b4763e17f543ef,
+        0x50d9e5716e78083a,
+        0x0bce2385cf8d74ff,
+        0xaf23032cd5f0e04b,
+        0xd366aa112b6159d9,
+        0x810a3ad3ac7979db,
+    ],
+    [
+        0x0a4a11d794be40a2,
+        0xeebf0cf23b668a3f,
+        0x600873fb011d761b,
+        0x0bfb5591a02ff618,
+        0xa16e2a528910af52,
+        0xf6553653e2878421,
+        0xccbe7c7a601a30c0,
+        0xb18b214fe489f5b3,
+        0xe21017ab9e153425,
+        0x586099ede17af9a6,
+        0x385078b514f50647,
+        0xc02b3a9afb89883d,
+    ],
+    [
+        0x6d3fbd3b4a9f1de6,
+        0x4b4d40a41b0f473c,
+        0x838f1887b8f31711,
+        0x9396895be5c58a41,
+        0x6247a479d66fc2e3,
+        0x13fe228a98f2d0a2,
+        0x5ba5fde765f9481e,
+        0xafb89fa62267e117,
+        0xfa4dc1bebcaa6333,
+        0xdbab590882b87289,
+        0xc3b6c08e23ba9301,
+        0xd84b5de94a324fb7,
+    ],
+    [
+        0x0d0c371c5b35b850,
+        0x7964f570e7188038,
+        0x5daf18bbd996604c,
+        0x6743bc47b9595258,
+        0x5528b9362c59bb71,
+        0xac45e25b7127b68c,
+        0xa2077d7dfbb606b6,
+        0xf3faac6faee378af,
+        0x0c6388b51545e884,
+        0xd27dbb6944917b61,
+        0x89bcac584344c104,
+        0x856bab802ce7402d,
+    ],
+    [
+        0x2cff3000be1fcd0a,
+        0x765f2977fa72a917,
+        0x1443711329f5f9d5,
+        0xd35cd0261af2f951,
+        0x2a1bb986084ec281,
+        0x2334a54b758f23f2,
+        0xa9b8cb612caf706b,
+        0xb6ba11c4ab1a1017,
+        0xde96b0824b4b46e2,
+        0xc59d4272c6d92e2c,
+        0x389bb5107611754d,
+        0x23647fbc77657372,
+    ],
+    [
+        0xd5ef60d6f76a42fa,
+        0xebb406bb79ac9819,
+        0x55faccc709a2f423,
+        0xd9d6ea97490091cd,
+        0xef3ce5069647a7e4,
+        0xdf31625d3fa78464,
+        0x242e60fd68f10f66,
+        0x39c966cc815f084d,
+        0x20e2e22e02bae3f7,
+        0xb38919d3f1173d7c,
+        0xf17769f6c77084d9,
+        0xcc051d8094cac41f,
+    ],
+    [
+        0x942069f5d6eece7e,
+        0x8d61d3e6f141c572,
+        0xc5cef9d85dd605f4,
+        0x938f2ac2bf885997,
+        0x23bddbace7c48f6c,
+        0xc90a6c5ba98537e4,
+        0x0be6ee2cca90f6ae,
+        0xa026175394ae0e90,
+        0x29fca3e314c77628,
+        0x2aa2aa8738ab7b77,
+        0xe11bbd31fbb8cac6,
+        0xb5bbbef1b78a23af,
+    ],
+    [
+        0x8b62a5551e9a9797,
+        0x3f91073d4d491c80,
+        0x4cfa44976396424a,
+        0xf8dcb2dfb3aa1b44,
+        0x3849409eba1a95f5,
+        0x070845799f234380,
+        0x184c0093667da1ba,
+        0xbd66aafccd51601e,
+        0xee6d14e92155b490,
+        0x626f2ec1865bc544,
+        0x1bd2854bf6485986,
+        0x368b8497472f12ef,
+    ],
+    [
+        0x4f88cdcdfb791921,
+        0xe2c0acfeda9ae781,
+        0x9739bc21773469b3,
+        0x00ce3ad64dc4bb8f,
+        0xaab85a321ee7a4c8,
+        0xd5de825be97004f4,
+        0x48d676d3a043b1c6,
+        0x9c6180b1ff643097,
+        0x34882a89dd590b09,
+        0xae7e6b0d249c3b1d,
+        0x8c016908a04885a1,
+        0x83ebaaebc9ae0721,
+    ],
+    [
+        0xab21b42e0f642307,
+        0xdb46631f62bb29c1,
+        0xef29f0399e09b5d9,
+        0x5b52fbb3613b8ba1,
+        0x57e129fcc96922e6,
+        0xcdeb14c9d9204b3a,
+        0x1341ef0da8536e34,
+        0xd7e3400f2bacde63,
+        0x6911eeb42f70d7e5,
+        0xc3a2a910a4679767,
+        0x1773cbe4a0f6bb28,
+        0xe17b0d53e843eab5,
+    ],
+    [
+        0x587fa39990b62800,
+        0x0d5d32788135879d,
+        0x277f7b31fd3a4cdb,
+        0xa435290ee56d7efa,
+        0xea6f40be35159925,
+        0xcb73377a506171cb,
+        0xe43c367ce731d82a,
+        0x6eb305031ca10c43,
+        0xc019a8c622cc84cb,
+        0xd5614f5658c612e6,
+        0x7b1ecbe957c3ff98,
+        0x60db6ee9651a8478,
+    ],
+    // Terminal full rounds (4)
+    [
+        0x9271d450fc9b4117,
+        0xcffeea06b6e3aac1,
+        0xfa4a44c748d1cd8e,
+        0xe64db01ba569b469,
+        0xd31005160e4045fe,
+        0x39e0fa013e025f79,
+        0xe243be574196a956,
+        0x205b2a681e3d2642,
+        0x79cae5ad93486bab,
+        0xfdf567844e32c295,
+        0x331679589bfb7189,
+        0xaf06ee32297b89c2,
+    ],
+    [
+        0xa6bcae311e498491,
+        0x9d16f52c96ac8b3e,
+        0x48a674b59393fa35,
+        0x0f9e65da3fde3796,
+        0x1e098310fc84578c,
+        0x559ae5fab1ae8dad,
+        0x56bd4d624078881d,
+        0xfd8bbbf8fbe817b5,
+        0x82d30695c44df534,
+        0x3ec0a97bc41127c5,
+        0x1eb8b64adaa22078,
+        0x82c45e418d60c983,
+    ],
+    [
+        0xb092280f484d55bf,
+        0xcd317c9537697939,
+        0xd3be2e352feb79f3,
+        0xca6d866539a390e5,
+        0xb5efb1a494e55ee6,
+        0xfa9013ac89756e9e,
+        0xaeb88efd1e981242,
+        0x13ee477cdab6e0dc,
+        0xce7df902c40da2d3,
+        0xf3fbaf0d4e6f5f34,
+        0xf96354ada6785f38,
+        0x13b5692812406886,
+    ],
+    [
+        0xf03cae030a0f4418,
+        0x7d3172887aa98e1a,
+        0x8a2c2644f2faf7b9,
+        0x80d721abee696d00,
+        0x27c8b903a4d68267,
+        0xaf0b7b12f90291b8,
+        0x00acd08cfdff3817,
+        0x4659ee496c634328,
+        0xf5b25c10730dbff1,
+        0xdde3a153297329c2,
+        0x50c0b70d6910a44b,
+        0x23c7426af725a6a0,
+    ],
+]);
+
+/// Create the default width-8 Poseidon1 permutation for Goldilocks (aarch64 build).
+///
+/// Derives the optimized full/partial round constants from the width-8
+/// parameters and wraps them, together with the fused permutation, in the
+/// dual-dispatch type (generic for scalar, fused ASM for packed).
+#[cfg(target_arch = "aarch64")]
+pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> {
+    let constants = Poseidon1Constants {
+        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+        mds_circ_col: MATRIX_CIRC_MDS_8_COL,
+        round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(),
+    };
+    let (full, partial) = constants.to_optimized();
+    let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial);
+    crate::Poseidon1GoldilocksDispatch::new(fused, full, partial)
+}
+
+/// Create the default width-8 Poseidon1 permutation for Goldilocks (non-aarch64 build).
+///
+/// Builds the generic `Poseidon1` permutation directly from the width-8 round
+/// counts, MDS column and round constants; no fused ASM path is used here.
+#[cfg(not(target_arch = "aarch64"))]
+pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> {
+    Poseidon1::new(&Poseidon1Constants {
+        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+        mds_circ_col: MATRIX_CIRC_MDS_8_COL,
+        round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(),
+    })
+}
+
+/// Create the default width-12 Poseidon1 permutation for Goldilocks (aarch64 build).
+///
+/// Derives the optimized full/partial round constants from the width-12
+/// parameters and wraps them, together with the fused permutation, in the
+/// dual-dispatch type (generic for scalar, fused ASM for packed).
+#[cfg(target_arch = "aarch64")]
+pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> {
+    let constants = Poseidon1Constants {
+        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12,
+        mds_circ_col: MATRIX_CIRC_MDS_12_COL,
+        round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(),
+    };
+    let (full, partial) = constants.to_optimized();
+    let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial);
+    crate::Poseidon1GoldilocksDispatch::new(fused, full, partial)
+}
+
+/// Create the default width-12 Poseidon1 permutation for Goldilocks (non-aarch64 build).
+///
+/// Builds the generic `Poseidon1` permutation directly from the width-12 round
+/// counts, MDS column and round constants; no fused ASM path is used here.
+#[cfg(not(target_arch = "aarch64"))]
+pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> {
+    Poseidon1::new(&Poseidon1Constants {
+        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12,
+        mds_circ_col: MATRIX_CIRC_MDS_12_COL,
+        round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(),
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_symmetric::Permutation;
+    use rand::SeedableRng;
+    use rand::rngs::SmallRng;
+
+    use super::*;
+
+    type F = Goldilocks;
+
+    /// Known-answer test for width 8 (sequential 0..7 input).
+    #[test]
+    fn test_poseidon_goldilocks_width_8() {
+        let perm = default_goldilocks_poseidon1_8();
+
+        let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
+        perm.permute_mut(&mut input);
+
+        let expected: [F; 8] = F::new_array([
+            2431226948502761687,
+            9427563026145807618,
+            6827549936272051660,
+            16907684411084503785,
+            10131745626715172913,
+            17448305483431576765,
+            9066501914269485014,
+            12095238468458521303,
+        ]);
+        assert_eq!(input, expected);
+    }
+
+    /// Known-answer test for width 12 (sequential 0..11 input).
+    #[test]
+    fn test_poseidon_goldilocks_width_12() {
+        let perm = default_goldilocks_poseidon1_12();
+
+        let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
+        perm.permute_mut(&mut input);
+
+        let expected: [F; 12] = F::new_array([
+            15595088881848875364,
+            9564850329150784619,
+            13607005230761744521,
+            12117102595842533385,
+            2814257411756993122,
+            11640647689983397089,
+            14363867760831937423,
+            13323891071259596526,
+            11219803511311150468,
+            9221595262780869902,
+            5898229059046891887,
+            18181291031484020550,
+        ]);
+        assert_eq!(input, expected);
+    }
+
+    /// Smoke test for width 16 with random constants (reusing the width-8 partial-round count).
+    /// Uses the generic type directly since the fused type only supports 8 and 12.
+    #[test]
+    fn test_poseidon_goldilocks_width_16() {
+        let mut rng = SmallRng::seed_from_u64(1);
+        let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng(
+            GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+            GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+            &MdsMatrixGoldilocks,
+            &mut rng,
+        );
+        let input: [F; 16] = rand::RngExt::random(&mut rng);
+        let output = poseidon.permute(input);
+        assert_ne!(output, input);
+    }
+
+    /// Smoke test for width 24 with random constants (reusing the width-8 partial-round count).
+    #[test]
+    fn test_poseidon_goldilocks_width_24() {
+        let mut rng = SmallRng::seed_from_u64(1);
+        let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng(
+            GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+            GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+            &MdsMatrixGoldilocks,
+            &mut rng,
+        );
+        let input: [F; 24] = rand::RngExt::random(&mut rng);
+        let output = poseidon.permute(input);
+        assert_ne!(output, input);
+    }
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+    mod avx512 {
+        use super::*;
+        use crate::PackedGoldilocksAVX512;
+
+        #[test]
+        fn test_avx512_poseidon_width_16() {
+            let mut rng = SmallRng::seed_from_u64(1);
+            let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng(
+                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+                &MdsMatrixGoldilocks,
+                &mut rng,
+            );
+            let input: [F; 16] = rand::RngExt::random(&mut rng);
+
+            let mut expected = input;
+            poseidon.permute_mut(&mut expected);
+
+            let mut avx512_input = input.map(Into::<PackedGoldilocksAVX512>::into);
+            poseidon.permute_mut(&mut avx512_input);
+
+            let avx512_output = avx512_input.map(|x| x.0[0]);
+            assert_eq!(avx512_output, expected);
+        }
+
+        #[test]
+        fn test_avx512_poseidon_width_24() {
+            let mut rng = SmallRng::seed_from_u64(1);
+            let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng(
+                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+                &MdsMatrixGoldilocks,
+                &mut rng,
+            );
+            let input: [F; 24] = rand::RngExt::random(&mut rng);
+
+            let mut expected = input;
+            poseidon.permute_mut(&mut expected);
+
+            let mut avx512_input = input.map(Into::<PackedGoldilocksAVX512>::into);
+            poseidon.permute_mut(&mut avx512_input);
+
+            let avx512_output = avx512_input.map(|x| x.0[0]);
+            assert_eq!(avx512_output, expected);
+        }
+    }
+
+    #[cfg(all(
+        target_arch = "x86_64",
+        target_feature = "avx2",
+        not(target_feature = "avx512f")
+    ))]
+    mod avx2 {
+        use super::*;
+        use crate::PackedGoldilocksAVX2;
+
+        #[test]
+        fn test_avx2_poseidon_width_16() {
+            let mut rng = SmallRng::seed_from_u64(1);
+            let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng(
+                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+                &MdsMatrixGoldilocks,
+                &mut rng,
+            );
+            let input: [F; 16] = rand::RngExt::random(&mut rng);
+
+            let mut expected = input;
+            poseidon.permute_mut(&mut expected);
+
+            let mut avx2_input = input.map(Into::<PackedGoldilocksAVX2>::into);
+            poseidon.permute_mut(&mut avx2_input);
+
+            let avx2_output = avx2_input.map(|x| x.0[0]);
+            assert_eq!(avx2_output, expected);
+        }
+
+        #[test]
+        fn test_avx2_poseidon_width_24() {
+            let mut rng = SmallRng::seed_from_u64(1);
+            let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng(
+                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
+                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
+                &MdsMatrixGoldilocks,
+                &mut rng,
+            );
+            let input: [F; 24] = rand::RngExt::random(&mut rng);
+
+            let mut expected = input;
+            poseidon.permute_mut(&mut expected);
+
+            let mut avx2_input = input.map(Into::<PackedGoldilocksAVX2>::into);
+            poseidon.permute_mut(&mut avx2_input);
+
+            let avx2_output = avx2_input.map(|x| x.0[0]);
+            assert_eq!(avx2_output, expected);
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    mod neon {
+        use super::*;
+        use crate::PackedGoldilocksNeon;
+
+        #[test]
+        fn test_neon_poseidon_width_8() {
+            let perm = default_goldilocks_poseidon1_8();
+            let input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
+
+            let mut expected = input;
+            perm.permute_mut(&mut expected);
+
+            let mut neon_input = input.map(Into::<PackedGoldilocksNeon>::into);
+            perm.permute_mut(&mut neon_input);
+
+            let neon_output = neon_input.map(|x| x.0[0]);
+            assert_eq!(neon_output, expected);
+        }
+
+        #[test]
+        fn test_neon_poseidon_width_12() {
+            let perm = default_goldilocks_poseidon1_12();
+            let input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
+
+            let mut expected = input;
+            perm.permute_mut(&mut expected);
+
+            let mut neon_input = input.map(Into::<PackedGoldilocksNeon>::into);
+            perm.permute_mut(&mut neon_input);
+
+            let neon_output = neon_input.map(|x| x.0[0]);
+            assert_eq!(neon_output, expected);
+        }
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs
new file mode 100644
index 000000000..b5d158610
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs
@@ -0,0 +1,980 @@
+//! Implementation of Poseidon2, see: <https://eprint.iacr.org/2023/323>
+
+use alloc::vec::Vec;
+
+use p3_field::{Algebra, InjectiveMonomial, PrimeCharacteristicRing};
+#[cfg(not(target_arch = "aarch64"))]
+use p3_poseidon2::Poseidon2;
+use p3_poseidon2::{
+    ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, GenericPoseidon2LinearLayers,
+    InternalLayer, InternalLayerConstructor, MDSMat4, add_rc_and_sbox_generic,
+    external_initial_permute_state, external_terminal_permute_state, internal_permute_state,
+    matmul_internal,
+};
+
+use crate::Goldilocks;
+use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE;
+
+/// Number of full rounds per half for Goldilocks Poseidon2 (`RF / 2`).
+///
+/// The total number of full rounds is `RF = 8` (4 initial + 4 final external rounds).
+/// Follows the Poseidon2 paper's security analysis with a +2 RF margin.
+pub const GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS: usize = 4;
+
+/// Number of partial (internal) rounds for Goldilocks Poseidon2 (width 8).
+///
+/// Derived from the Poseidon paper's interpolation bound (Eq. 3), with α = 7 and t = 8:
+///
+///   R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5
+///            = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20
+///
+/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
+pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8: usize = 22;
+
+/// Number of partial (internal) rounds for Goldilocks Poseidon2 (width 12).
+///
+/// Same interpolation bound as width 8, with t = 12:
+///
+///   R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20
+///
+/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
+pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_12: usize = 22;
+
+/// The Poseidon2 permutation type for the Goldilocks field on aarch64.
+///
+/// Alias for `Poseidon2GoldilocksFused`; acts on arrays of the form `[Goldilocks; WIDTH]`.
+#[cfg(target_arch = "aarch64")]
+pub type Poseidon2Goldilocks<const WIDTH: usize> = crate::Poseidon2GoldilocksFused<WIDTH>;
+
+/// The Poseidon2 permutation type for the Goldilocks field on non-aarch64 targets.
+///
+/// Alias for the generic `Poseidon2` with the Goldilocks layers; acts on `[Goldilocks; WIDTH]`.
+#[cfg(not(target_arch = "aarch64"))]
+pub type Poseidon2Goldilocks<const WIDTH: usize> = Poseidon2<
+    Goldilocks,
+    Poseidon2ExternalLayerGoldilocks<WIDTH>,
+    Poseidon2InternalLayerGoldilocks,
+    WIDTH,
+    GOLDILOCKS_S_BOX_DEGREE,
+>;
+
+/// Initial external-round constants for width-8 Poseidon2 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
+///
+/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
+///
+/// Layout: external_initial (4 rounds × 8 elements).
+pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL: [[Goldilocks; 8]; 4] = [
+    Goldilocks::new_array([
+        0xdd5743e7f2a5a5d9,
+        0xcb3a864e58ada44b,
+        0xffa2449ed32f8cdc,
+        0x42025f65d6bd13ee,
+        0x7889175e25506323,
+        0x34b98bb03d24b737,
+        0xbdcc535ecc4faa2a,
+        0x5b20ad869fc0d033,
+    ]),
+    Goldilocks::new_array([
+        0xf1dda5b9259dfcb4,
+        0x27515210be112d59,
+        0x4227d1718c766c3f,
+        0x26d333161a5bd794,
+        0x49b938957bf4b026,
+        0x4a56b5938b213669,
+        0x1120426b48c8353d,
+        0x6b323c3f10a56cad,
+    ]),
+    Goldilocks::new_array([
+        0xce57d6245ddca6b2,
+        0xb1fc8d402bba1eb1,
+        0xb5c5096ca959bd04,
+        0x6db55cd306d31f7f,
+        0xc49d293a81cb9641,
+        0x1ce55a4fe979719f,
+        0xa92e60a9d178a4d1,
+        0x002cc64973bcfd8c,
+    ]),
+    Goldilocks::new_array([
+        0xcea721cce82fb11b,
+        0xe5b55eb8098ece81,
+        0x4e30525c6f1ddd66,
+        0x43c6702827070987,
+        0xaca68430a7b5762a,
+        0x3674238634df9c93,
+        0x88cee1c825e33433,
+        0xde99ae8d74b57176,
+    ]),
+];
+
+/// Final external-round constants for width-8 Poseidon2 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
+///
+/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
+///
+/// Layout: external_final (4 rounds × 8 elements).
+pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL: [[Goldilocks; 8]; 4] = [
+    Goldilocks::new_array([
+        0x014ef1197d341346,
+        0x9725e20825d07394,
+        0xfdb25aef2c5bae3b,
+        0xbe5402dc598c971e,
+        0x93a5711f04cdca3d,
+        0xc45a9a5b2f8fb97b,
+        0xfe8946a924933545,
+        0x2af997a27369091c,
+    ]),
+    Goldilocks::new_array([
+        0xaa62c88e0b294011,
+        0x058eb9d810ce9f74,
+        0xb3cb23eced349ae4,
+        0xa3648177a77b4a84,
+        0x43153d905992d95d,
+        0xf4e2a97cda44aa4b,
+        0x5baa2702b908682f,
+        0x082923bdf4f750d1,
+    ]),
+    Goldilocks::new_array([
+        0x98ae09a325893803,
+        0xf8a6475077968838,
+        0xceb0735bf00b2c5f,
+        0x0a1a5d953888e072,
+        0x2fcb190489f94475,
+        0xb5be06270dec69fc,
+        0x739cb934b09acf8b,
+        0x537750b75ec7f25b,
+    ]),
+    Goldilocks::new_array([
+        0xe9dd318bae1f3961,
+        0xf7462137299efe1a,
+        0xb1f6b8eee9adb940,
+        0xbdebcc8a809dfe6b,
+        0x40fc1f791b178113,
+        0x3ac1c3362d014864,
+        0x9a016184bdb8aeba,
+        0x95f2394459fbc25e,
+    ]),
+];
+
+/// Internal-round constants for width-8 Poseidon2 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
+///
+/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
+///
+/// Layout: internal (22 scalar constants).
+pub const GOLDILOCKS_POSEIDON2_RC_8_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([
+    0x488897d85ff51f56,
+    0x1140737ccb162218,
+    0xa7eeb9215866ed35,
+    0x9bd2976fee49fcc9,
+    0xc0c8f0de580a3fcc,
+    0x4fb2dae6ee8fc793,
+    0x343a89f35f37395b,
+    0x223b525a77ca72c8,
+    0x56ccb62574aaa918,
+    0xc4d507d8027af9ed,
+    0xa080673cf0b7e95c,
+    0xf0184884eb70dcf8,
+    0x044f10b0cb3d5c69,
+    0xe9e3f7993938f186,
+    0x1b761c80e772f459,
+    0x606cec607a1b5fac,
+    0x14a0c2e1d45f03cd,
+    0x4eace8855398574f,
+    0xf905ca7103eff3e6,
+    0xf8c8f8d20862c059,
+    0xb524fe8bdd678e5a,
+    0xfbb7865901a1ec41,
+]);
+
+/// Initial external-round constants for width-12 Poseidon2 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
+///
+/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
+///
+/// Layout: external_initial (4 rounds × 12 elements).
+pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL: [[Goldilocks; 12]; 4] = [
+    Goldilocks::new_array([
+        0x13dcf33aba214f46,
+        0x30b3b654a1da6d83,
+        0x1fc634ada6159b56,
+        0x937459964dc03466,
+        0xedd2ef2ca7949924,
+        0xede9affde0e22f68,
+        0x8515b9d6bac9282d,
+        0x6b5c07b4e9e900d8,
+        0x1ec66368838c8a08,
+        0x9042367d80d1fbab,
+        0x400283564a3c3799,
+        0x4a00be0466bca75e,
+    ]),
+    Goldilocks::new_array([
+        0x7913beee58e3817f,
+        0xf545e88532237d90,
+        0x22f8cb8736042005,
+        0x6f04990e247a2623,
+        0xfe22e87ba37c38cd,
+        0xd20e32c85ffe2815,
+        0x117227674048fe73,
+        0x4e9fb7ea98a6b145,
+        0xe0866c232b8af08b,
+        0x00bbc77916884964,
+        0x7031c0fb990d7116,
+        0x240a9e87cf35108f,
+    ]),
+    Goldilocks::new_array([
+        0x2e6363a5a12244b3,
+        0x5e1c3787d1b5011c,
+        0x4132660e2a196e8b,
+        0x3a013b648d3d4327,
+        0xf79839f49888ea43,
+        0xfe85658ebafe1439,
+        0xb6889825a14240bd,
+        0x578453605541382b,
+        0x4508cda8f6b63ce9,
+        0x9c3ef35848684c91,
+        0x0812bde23c87178c,
+        0xfe49638f7f722c14,
+    ]),
+    Goldilocks::new_array([
+        0x8e3f688ce885cbf5,
+        0xb8e110acf746a87d,
+        0xb4b2e8973a6dabef,
+        0x9e714c5da3d462ec,
+        0x6438f9033d3d0c15,
+        0x24312f7cf1a27199,
+        0x23f843bb47acbf71,
+        0x9183f11a34be9f01,
+        0x839062fbb9d45dbf,
+        0x24b56e7e6c2e43fa,
+        0xe1683da61c962a72,
+        0xa95c63971a19bfa7,
+    ]),
+];
+
+/// Final external-round constants for width-12 Poseidon2 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
+///
+/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
+///
+/// Layout: external_final (4 rounds × 12 elements).
+pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL: [[Goldilocks; 12]; 4] = [
+    Goldilocks::new_array([
+        0xc68be7c94882a24d,
+        0xaf996d5d5cdaedd9,
+        0x9717f025e7daf6a5,
+        0x6436679e6e7216f4,
+        0x8a223d99047af267,
+        0xbb512e35a133ba9a,
+        0xfbbf44097671aa03,
+        0xf04058ebf6811e61,
+        0x5cca84703fac7ffb,
+        0x9b55c7945de6469f,
+        0x8e05bf09808e934f,
+        0x2ea900de876307d7,
+    ]),
+    Goldilocks::new_array([
+        0x7748fff2b38dfb89,
+        0x6b99a676dd3b5d81,
+        0xac4bb7c627cf7c13,
+        0xadb6ebe5e9e2f5ba,
+        0x2d33378cafa24ae3,
+        0x1e5b73807543f8c2,
+        0x09208814bfebb10f,
+        0x782e64b6bb5b93dd,
+        0xadd5a48eac90b50f,
+        0xadd4c54c736ea4b1,
+        0xd58dbb86ed817fd8,
+        0x6d5ed1a533f34ddd,
+    ]),
+    Goldilocks::new_array([
+        0x28686aa3e36b7cb9,
+        0x591abd3476689f36,
+        0x047d766678f13875,
+        0xa2a11112625f5b49,
+        0x21fd10a3f8304958,
+        0xf9b40711443b0280,
+        0xd2697eb8b2bde88e,
+        0x3493790b51731b3f,
+        0x11caf9dd73764023,
+        0x7acfb8f72878164e,
+        0x744ec4db23cefc26,
+        0x1e00e58f422c6340,
+    ]),
+    Goldilocks::new_array([
+        0x21dd28d906a62dda,
+        0xf32a46ab5f465b5f,
+        0xbfce13201f3f7e6b,
+        0xf30d2e7adb5304e2,
+        0xecdf4ee4abad48e9,
+        0xf94e82182d395019,
+        0x4ee52e3744d887c5,
+        0xa1341c7cac0083b2,
+        0x2302fb26c30c834a,
+        0xaea3c587273bf7d3,
+        0xf798e24961823ec7,
+        0x962deba3e9a2cd94,
+    ]),
+];
+
+/// Internal-round constants for width-12 Poseidon2 on Goldilocks.
+///
+/// Generated by the Grain LFSR with parameters:
+///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
+///
+/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
+///
+/// Layout: internal (22 scalar constants).
+pub const GOLDILOCKS_POSEIDON2_RC_12_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([
+    0x4adf842aa75d4316,
+    0xf8fbb871aa4ab4eb,
+    0x68e85b6eb2dd6aeb,
+    0x07a0b06b2d270380,
+    0xd94e0228bd282de4,
+    0x8bdd91d3250c5278,
+    0x209c68b88bba778f,
+    0xb5e18cdab77f3877,
+    0xb296a3e808da93fa,
+    0x8370ecbda11a327e,
+    0x3f9075283775dad8,
+    0xb78095bb23c6aa84,
+    0x3f36b9fe72ad4e5f,
+    0x69bc96780b10b553,
+    0x3f1d341f2eb7b881,
+    0x4e939e9815838818,
+    0xda366b3ae2a31604,
+    0xbc89db1e7287d509,
+    0x6102f411f9ef5659,
+    0x58725c5e7ac1f0ab,
+    0x0df5856c798883e7,
+    0xf7bb62a8da4c961b,
+]);
+
+/// Default width-8 Poseidon2 permutation for Goldilocks (non-aarch64: generic `Poseidon2`).
+#[cfg(not(target_arch = "aarch64"))]
+pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> {
+    Poseidon2::new(
+        ExternalLayerConstants::new(
+            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(),
+            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(),
+        ),
+        GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec(),
+    )
+}
+
+/// Default width-8 Poseidon2 permutation for Goldilocks (aarch64: `Poseidon2GoldilocksFused`).
+#[cfg(target_arch = "aarch64")]
+pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> {
+    crate::Poseidon2GoldilocksFused::new(
+        &ExternalLayerConstants::new(
+            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(),
+            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(),
+        ),
+        &GOLDILOCKS_POSEIDON2_RC_8_INTERNAL,
+    )
+}
+
+/// Default width-12 Poseidon2 permutation for Goldilocks (non-aarch64: generic `Poseidon2`).
+#[cfg(not(target_arch = "aarch64"))]
+pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> {
+    Poseidon2::new(
+        ExternalLayerConstants::new(
+            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(),
+            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(),
+        ),
+        GOLDILOCKS_POSEIDON2_RC_12_INTERNAL.to_vec(),
+    )
+}
+
+/// Default width-12 Poseidon2 permutation for Goldilocks (aarch64: `Poseidon2GoldilocksFused`).
+#[cfg(target_arch = "aarch64")]
+pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> {
+    crate::Poseidon2GoldilocksFused::new(
+        &ExternalLayerConstants::new(
+            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(),
+            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(),
+        ),
+        &GOLDILOCKS_POSEIDON2_RC_12_INTERNAL,
+    )
+}
+
+pub const MATRIX_DIAG_8_GOLDILOCKS: [Goldilocks; 8] = Goldilocks::new_array([
+    0xfffffffeffffffff, // -2 (= p - 2, where p = 2^64 - 2^32 + 1)
+    0x0000000000000001, // 1
+    0x0000000000000002, // 2
+    0x7fffffff80000001, // 1/2 (= (p + 1) / 2)
+    0x0000000000000003, // 3
+    0x7fffffff80000000, // -1/2
+    0xfffffffefffffffe, // -3
+    0xfffffffefffffffd, // -4
+]); // Multipliers V[i] of the width-8 internal layer: state[i] <- V[i] * state[i] + sum(state).
+
+pub const MATRIX_DIAG_12_GOLDILOCKS: [Goldilocks; 12] = Goldilocks::new_array([
+    0xfffffffeffffffff, // -2 (= p - 2, where p = 2^64 - 2^32 + 1)
+    0x0000000000000001, // 1
+    0x0000000000000002, // 2
+    0x7fffffff80000001, // 1/2 (= (p + 1) / 2)
+    0x0000000000000003, // 3
+    0x0000000000000004, // 4
+    0x7fffffff80000000, // -1/2
+    0xfffffffefffffffe, // -3
+    0xfffffffefffffffd, // -4
+    0xbfffffff40000001, // 1/2^2
+    0x3fffffffc0000000, // -1/2^2
+    0xdfffffff20000001, // 1/2^3
+]); // Multipliers V[i] of the width-12 internal layer: state[i] <- V[i] * state[i] + sum(state).
+
+pub const MATRIX_DIAG_16_GOLDILOCKS: [Goldilocks; 16] = Goldilocks::new_array([
+    0xfffffffeffffffff, // -2 (= p - 2, where p = 2^64 - 2^32 + 1)
+    0x0000000000000001, // 1
+    0x0000000000000002, // 2
+    0x7fffffff80000001, // 1/2 (= (p + 1) / 2)
+    0x0000000000000003, // 3
+    0x0000000000000004, // 4
+    0x7fffffff80000000, // -1/2
+    0xfffffffefffffffe, // -3
+    0xfffffffefffffffd, // -4
+    0xdfffffff20000001, // 1/2^3
+    0xefffffff10000001, // 1/2^4
+    0xf7ffffff08000001, // 1/2^5
+    0x1fffffffe0000000, // -1/2^3
+    0x0ffffffff0000000, // -1/2^4
+    0x07fffffff8000000, // -1/2^5
+    0xfffffffe00000002, // 1/2^32 (= 1 - 2^32 mod p, since 2^64 = 2^32 - 1 mod p)
+]); // Multipliers V[i] of the width-16 internal layer: state[i] <- V[i] * state[i] + sum(state).
+
+pub const MATRIX_DIAG_20_GOLDILOCKS: [Goldilocks; 20] = Goldilocks::new_array([
+    0x95c381fda3b1fa57,
+    0xf36fe9eb1288f42c,
+    0x89f5dcdfef277944,
+    0x106f22eadeb3e2d2,
+    0x684e31a2530e5111,
+    0x27435c5d89fd148e,
+    0x3ebed31c414dbf17,
+    0xfd45b0b2d294e3cc,
+    0x48c904473a7f6dbf,
+    0xe0d1b67809295b4d,
+    0xddd1941e9d199dcb,
+    0x8cfe534eeb742219,
+    0xa6e5261d9e3b8524,
+    0x6897ee5ed0f82c1b,
+    0x0e7dcd0739ee5f78,
+    0x493253f3d0d32363,
+    0xbb2737f5845f05c0,
+    0xa187e810b06ad903,
+    0xb635b995936c4918,
+    0x0b3694a940bd2394,
+]); // Width-20 diagonal; applied via `matmul_internal` (no shift-and-add form in this file).
+
+/// Width-8 internal linear layer: `state[i] <- V[i] * state[i] + sum(state)`.
+///
+/// `V = [-2, 1, 2, 1/2, 3, -1/2, -3, -4]`, the entries of `MATRIX_DIAG_8_GOLDILOCKS`.
+/// Every product `V[i] * state[i]` is assembled from additions and `halve()` calls, so no
+/// multiplication by a field constant is performed.
+fn internal_layer_mat_mul_goldilocks_8<A: Algebra<Goldilocks>>(state: &mut [A; 8]) {
+    // Sum of the *incoming* state; it has to be taken before any slot is overwritten.
+    let total: A = state.iter().map(|x| x.dup()).sum();
+
+    // Slot `i` is read only while it is being replaced, so the update runs in place and
+    // needs no snapshot of the old state.
+
+    // V[0] = -2
+    let x0 = state[0].dup();
+    state[0] = total.dup() - (x0.dup() + x0);
+
+    // V[1] = 1
+    state[1] = total.dup() + state[1].dup();
+
+    // V[2] = 2
+    let x2 = state[2].dup();
+    state[2] = total.dup() + (x2.dup() + x2);
+
+    // V[3] = 1/2
+    state[3] = total.dup() + state[3].halve();
+
+    // V[4] = 3
+    let x4 = state[4].dup();
+    let triple_x4 = x4.dup() + x4.dup() + x4;
+    state[4] = total.dup() + triple_x4;
+
+    // V[5] = -1/2
+    state[5] = total.dup() - state[5].halve();
+
+    // V[6] = -3
+    let x6 = state[6].dup();
+    let triple_x6 = x6.dup() + x6.dup() + x6;
+    state[6] = total.dup() - triple_x6;
+
+    // V[7] = -4
+    let x7 = state[7].dup();
+    let double_x7 = x7.dup() + x7;
+    state[7] = total - (double_x7.dup() + double_x7);
+}
+
+/// Width-12 internal linear layer: `state[i] <- V[i] * state[i] + sum(state)`.
+///
+/// The multipliers are the entries of `MATRIX_DIAG_12_GOLDILOCKS`:
+/// ```text
+/// V = [-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/2^2, -1/2^2, 1/2^3]
+/// ```
+///
+/// Every product `V[i] * state[i]` is assembled from additions and `halve()` calls, so no
+/// multiplication by a field constant is performed.
+fn internal_layer_mat_mul_goldilocks_12<A: Algebra<Goldilocks>>(state: &mut [A; 12]) {
+    // Sum of the *incoming* state; it has to be taken before any slot is overwritten.
+    let total: A = state.iter().map(|x| x.dup()).sum();
+
+    // Slot `i` is read only while it is being replaced, so the update runs in place and
+    // needs no snapshot of the old state.
+
+    // V[0] = -2
+    let x0 = state[0].dup();
+    state[0] = total.dup() - (x0.dup() + x0);
+
+    // V[1] = 1
+    state[1] = total.dup() + state[1].dup();
+
+    // V[2] = 2
+    let x2 = state[2].dup();
+    state[2] = total.dup() + (x2.dup() + x2);
+
+    // V[3] = 1/2
+    state[3] = total.dup() + state[3].halve();
+
+    // V[4] = 3
+    let x4 = state[4].dup();
+    let triple_x4 = x4.dup() + x4.dup() + x4;
+    state[4] = total.dup() + triple_x4;
+
+    // V[5] = 4
+    let x5 = state[5].dup();
+    let double_x5 = x5.dup() + x5;
+    state[5] = total.dup() + (double_x5.dup() + double_x5);
+
+    // V[6] = -1/2
+    state[6] = total.dup() - state[6].halve();
+
+    // V[7] = -3
+    let x7 = state[7].dup();
+    let triple_x7 = x7.dup() + x7.dup() + x7;
+    state[7] = total.dup() - triple_x7;
+
+    // V[8] = -4
+    let x8 = state[8].dup();
+    let double_x8 = x8.dup() + x8;
+    state[8] = total.dup() - (double_x8.dup() + double_x8);
+
+    // V[9] = 1/2^2
+    state[9] = total.dup() + state[9].halve().halve();
+
+    // V[10] = -1/2^2
+    state[10] = total.dup() - state[10].halve().halve();
+
+    // V[11] = 1/2^3
+    state[11] = total + state[11].halve().halve().halve();
+}
+
+/// Width-16 internal linear layer: `state[i] <- V[i] * state[i] + sum(state)`.
+///
+/// The multipliers are the entries of `MATRIX_DIAG_16_GOLDILOCKS`:
+///
+/// ```text
+/// V = [-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4,
+///      1/2^3, 1/2^4, 1/2^5, -1/2^3, -1/2^4, -1/2^5, 1/2^32]
+/// ```
+///
+/// All products except the last are assembled from additions and `halve()` calls; only
+/// `V[15] = 1/2^32` is a real multiplication, by `MATRIX_DIAG_16_GOLDILOCKS[15]`.
+///
+/// `state` is updated in place.
+fn internal_layer_mat_mul_goldilocks_16<A: Algebra<Goldilocks>>(state: &mut [A; 16]) {
+    // Sum of the *incoming* state; it has to be taken before any slot is overwritten.
+    let total: A = state.iter().map(|x| x.dup()).sum();
+
+    // Slot `i` is read only while it is being replaced, so the update runs in place and
+    // needs no snapshot of the old state.
+
+    // V[0] = -2
+    let x0 = state[0].dup();
+    state[0] = total.dup() - (x0.dup() + x0);
+
+    // V[1] = 1
+    state[1] = total.dup() + state[1].dup();
+
+    // V[2] = 2
+    let x2 = state[2].dup();
+    state[2] = total.dup() + (x2.dup() + x2);
+
+    // V[3] = 1/2
+    state[3] = total.dup() + state[3].halve();
+
+    // V[4] = 3
+    let x4 = state[4].dup();
+    let triple_x4 = x4.dup() + x4.dup() + x4;
+    state[4] = total.dup() + triple_x4;
+
+    // V[5] = 4
+    let x5 = state[5].dup();
+    let double_x5 = x5.dup() + x5;
+    state[5] = total.dup() + (double_x5.dup() + double_x5);
+
+    // V[6] = -1/2
+    state[6] = total.dup() - state[6].halve();
+
+    // V[7] = -3
+    let x7 = state[7].dup();
+    let triple_x7 = x7.dup() + x7.dup() + x7;
+    state[7] = total.dup() - triple_x7;
+
+    // V[8] = -4
+    let x8 = state[8].dup();
+    let double_x8 = x8.dup() + x8;
+    state[8] = total.dup() - (double_x8.dup() + double_x8);
+
+    // V[9] = 1/2^3
+    state[9] = total.dup() + state[9].halve().halve().halve();
+
+    // V[10] = 1/2^4
+    state[10] = total.dup() + state[10].halve().halve().halve().halve();
+
+    // V[11] = 1/2^5
+    state[11] = total.dup() + state[11].halve().halve().halve().halve().halve();
+
+    // V[12] = -1/2^3
+    state[12] = total.dup() - state[12].halve().halve().halve();
+
+    // V[13] = -1/2^4
+    state[13] = total.dup() - state[13].halve().halve().halve().halve();
+
+    // V[14] = -1/2^5
+    state[14] = total.dup() - state[14].halve().halve().halve().halve().halve();
+
+    // V[15] = 1/2^32
+    let two_pow_neg_32 = MATRIX_DIAG_16_GOLDILOCKS[15];
+    let scaled_x15 = state[15].dup() * two_pow_neg_32;
+    state[15] = total + scaled_x15;
+}
+
+/// The internal layers of the Poseidon2 permutation over Goldilocks, for widths 8, 12, 16 and 20.
+#[derive(Debug, Clone, Default)]
+pub struct Poseidon2InternalLayerGoldilocks {
+    internal_constants: Vec<Goldilocks>, // round constants handed to `internal_permute_state`
+}
+
+impl InternalLayerConstructor<Goldilocks> for Poseidon2InternalLayerGoldilocks {
+    fn new_from_constants(internal_constants: Vec<Goldilocks>) -> Self {
+        Self { internal_constants } // stored unchanged; nothing is validated here
+    }
+}
+
+impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
+    InternalLayer<A, 8, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
+{
+    /// Apply the internal layers of Poseidon2 to a width-8 `state`, in place.
+    ///
+    /// Delegates to `internal_permute_state`, supplying the width-8 linear layer and the
+    /// stored internal constants.
+    fn permute_state(&self, state: &mut [A; 8]) {
+        let round_constants = &self.internal_constants;
+        internal_permute_state(state, internal_layer_mat_mul_goldilocks_8, round_constants);
+    }
+}
+
+impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
+    InternalLayer<A, 12, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
+{
+    /// Apply the internal layers of Poseidon2 to a width-12 `state`, in place.
+    ///
+    /// Delegates to `internal_permute_state`, supplying the width-12 linear layer and the
+    /// stored internal constants.
+    fn permute_state(&self, state: &mut [A; 12]) {
+        let round_constants = &self.internal_constants;
+        internal_permute_state(state, internal_layer_mat_mul_goldilocks_12, round_constants);
+    }
+}
+
+impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
+    InternalLayer<A, 16, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
+{
+    /// Apply the internal layers of Poseidon2 to a width-16 `state`, in place.
+    ///
+    /// Delegates to `internal_permute_state`, supplying the width-16 linear layer and the
+    /// stored internal constants.
+    fn permute_state(&self, state: &mut [A; 16]) {
+        let round_constants = &self.internal_constants;
+        internal_permute_state(state, internal_layer_mat_mul_goldilocks_16, round_constants);
+    }
+}
+
+impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
+    InternalLayer<A, 20, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
+{
+    /// Apply the internal layers of Poseidon2 to a width-20 `state`, in place.
+    ///
+    /// Width 20 has no hand-written linear layer in this file; it goes through the generic
+    /// `matmul_internal` with `MATRIX_DIAG_20_GOLDILOCKS`.
+    fn permute_state(&self, state: &mut [A; 20]) {
+        let diag_mul = |x: &mut [A; 20]| matmul_internal(x, MATRIX_DIAG_20_GOLDILOCKS);
+        internal_permute_state(state, diag_mul, &self.internal_constants);
+    }
+}
+
+/// The external layers of the Poseidon2 permutation over Goldilocks, for any `WIDTH`.
+#[derive(Clone)]
+pub struct Poseidon2ExternalLayerGoldilocks<const WIDTH: usize> {
+    pub(crate) external_constants: ExternalLayerConstants<Goldilocks, WIDTH>, // initial + terminal
+}
+
+impl<const WIDTH: usize> ExternalLayerConstructor<Goldilocks, WIDTH>
+    for Poseidon2ExternalLayerGoldilocks<WIDTH>
+{
+    fn new_from_constants(external_constants: ExternalLayerConstants<Goldilocks, WIDTH>) -> Self {
+        Self { external_constants } // stored unchanged; nothing is validated here
+    }
+}
+
+impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>, const WIDTH: usize>
+    ExternalLayer<A, WIDTH, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2ExternalLayerGoldilocks<WIDTH>
+{
+    /// Apply the initial external layers of Poseidon2 to `state`, in place.
+    ///
+    /// Delegates to `external_initial_permute_state` with the stored initial constants, the
+    /// generic `add_rc_and_sbox_generic` step and the `MDSMat4` matrix.
+    fn permute_state_initial(&self, state: &mut [A; WIDTH]) {
+        // Only the constants and the driver function differ from the terminal layers.
+        let initial_rcs = self.external_constants.get_initial_constants();
+        external_initial_permute_state(state, initial_rcs, add_rc_and_sbox_generic, &MDSMat4);
+    }
+
+    /// Apply the terminal external layers of Poseidon2 to `state`, in place.
+    ///
+    /// Delegates to `external_terminal_permute_state` with the stored terminal constants, the
+    /// generic `add_rc_and_sbox_generic` step and the `MDSMat4` matrix.
+    fn permute_state_terminal(&self, state: &mut [A; WIDTH]) {
+        // Only the constants and the driver function differ from the initial layers.
+        let terminal_rcs = self.external_constants.get_terminal_constants();
+        external_terminal_permute_state(state, terminal_rcs, add_rc_and_sbox_generic, &MDSMat4);
+    }
+}
+
+/// An implementation of the matrix multiplications in the internal and external layers of Poseidon2.
+///
+/// Acts on `[A; WIDTH]` (`WIDTH` = 8, 12, 16 or 20) for any `A: Algebra<Goldilocks>`.
+/// If you have either `[Goldilocks::Packing; WIDTH]` or `[Goldilocks; WIDTH]` it will be much faster
+/// to use `Poseidon2Goldilocks<WIDTH>` instead of building a Poseidon2 permutation using this.
+#[derive(Clone, Debug, Default)]
+pub struct GenericPoseidon2LinearLayersGoldilocks;
+
+impl GenericPoseidon2LinearLayers<8> for GenericPoseidon2LinearLayersGoldilocks {
+    /// In place: `state[i] <- MATRIX_DIAG_8_GOLDILOCKS[i] * state[i] + sum(state)`.
+    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 8]) {
+        let total: R = state.iter().map(|x| x.dup()).sum();
+        for (slot, diag) in state.iter_mut().zip(MATRIX_DIAG_8_GOLDILOCKS) {
+            *slot *= R::from_u64(diag.value);
+            *slot += total.dup();
+        }
+    }
+}
+
+impl GenericPoseidon2LinearLayers<12> for GenericPoseidon2LinearLayersGoldilocks {
+    /// In place: `state[i] <- MATRIX_DIAG_12_GOLDILOCKS[i] * state[i] + sum(state)`.
+    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 12]) {
+        let total: R = state.iter().map(|x| x.dup()).sum();
+        for (slot, diag) in state.iter_mut().zip(MATRIX_DIAG_12_GOLDILOCKS) {
+            *slot *= R::from_u64(diag.value);
+            *slot += total.dup();
+        }
+    }
+}
+
+impl GenericPoseidon2LinearLayers<16> for GenericPoseidon2LinearLayersGoldilocks {
+    /// In place: `state[i] <- MATRIX_DIAG_16_GOLDILOCKS[i] * state[i] + sum(state)`.
+    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 16]) {
+        let total: R = state.iter().map(|x| x.dup()).sum();
+        for (slot, diag) in state.iter_mut().zip(MATRIX_DIAG_16_GOLDILOCKS) {
+            *slot *= R::from_u64(diag.value);
+            *slot += total.dup();
+        }
+    }
+}
+
+impl GenericPoseidon2LinearLayers<20> for GenericPoseidon2LinearLayersGoldilocks {
+    /// In place: `state[i] <- MATRIX_DIAG_20_GOLDILOCKS[i] * state[i] + sum(state)`.
+    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 20]) {
+        let total: R = state.iter().map(|x| x.dup()).sum();
+        for (slot, diag) in state.iter_mut().zip(MATRIX_DIAG_20_GOLDILOCKS) {
+            *slot *= R::from_u64(diag.value);
+            *slot += total.dup();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field::PrimeCharacteristicRing;
+    use p3_symmetric::Permutation;
+
+    use super::*;
+
+    type F = Goldilocks;
+
+    #[test]
+    fn test_generic_internal_linear_layer_8_matches_matmul_internal() {
+        let mut state_generic = [
+            F::from_u64(1),
+            F::from_u64(2),
+            F::from_u64(3),
+            F::from_u64(4),
+            F::from_u64(5),
+            F::from_u64(6),
+            F::from_u64(7),
+            F::from_u64(8),
+        ];
+        let mut state_existing = state_generic;
+        // Same input through the generic layer and through `matmul_internal` with the same diagonal.
+        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
+        matmul_internal(&mut state_existing, MATRIX_DIAG_8_GOLDILOCKS);
+        // The two implementations must agree exactly.
+        assert_eq!(state_generic, state_existing);
+    }
+
+    #[test]
+    fn test_generic_internal_linear_layer_12_matches_matmul_internal() {
+        let mut state_generic = [
+            F::from_u64(1),
+            F::from_u64(2),
+            F::from_u64(3),
+            F::from_u64(4),
+            F::from_u64(5),
+            F::from_u64(6),
+            F::from_u64(7),
+            F::from_u64(8),
+            F::from_u64(9),
+            F::from_u64(10),
+            F::from_u64(11),
+            F::from_u64(12),
+        ];
+        let mut state_existing = state_generic;
+        // Same input through the generic layer and through `matmul_internal` with the same diagonal.
+        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
+        matmul_internal(&mut state_existing, MATRIX_DIAG_12_GOLDILOCKS);
+        // The two implementations must agree exactly.
+        assert_eq!(state_generic, state_existing);
+    }
+
+    #[test]
+    fn test_generic_internal_linear_layer_16_matches_matmul_internal() {
+        let mut state_generic = [
+            F::from_u64(1),
+            F::from_u64(2),
+            F::from_u64(3),
+            F::from_u64(4),
+            F::from_u64(5),
+            F::from_u64(6),
+            F::from_u64(7),
+            F::from_u64(8),
+            F::from_u64(9),
+            F::from_u64(10),
+            F::from_u64(11),
+            F::from_u64(12),
+            F::from_u64(13),
+            F::from_u64(14),
+            F::from_u64(15),
+            F::from_u64(16),
+        ];
+        let mut state_existing = state_generic;
+        // Same input through the generic layer and through `matmul_internal` with the same diagonal.
+        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
+        matmul_internal(&mut state_existing, MATRIX_DIAG_16_GOLDILOCKS);
+        // The two implementations must agree exactly.
+        assert_eq!(state_generic, state_existing);
+    }
+
+    #[test]
+    fn test_generic_internal_linear_layer_20_matches_matmul_internal() {
+        let mut state_generic = [
+            F::from_u64(1),
+            F::from_u64(2),
+            F::from_u64(3),
+            F::from_u64(4),
+            F::from_u64(5),
+            F::from_u64(6),
+            F::from_u64(7),
+            F::from_u64(8),
+            F::from_u64(9),
+            F::from_u64(10),
+            F::from_u64(11),
+            F::from_u64(12),
+            F::from_u64(13),
+            F::from_u64(14),
+            F::from_u64(15),
+            F::from_u64(16),
+            F::from_u64(17),
+            F::from_u64(18),
+            F::from_u64(19),
+            F::from_u64(20),
+        ];
+        let mut state_existing = state_generic;
+        // Same input through the generic layer and through `matmul_internal` with the same diagonal.
+        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
+        matmul_internal(&mut state_existing, MATRIX_DIAG_20_GOLDILOCKS);
+        // The two implementations must agree exactly.
+        assert_eq!(state_generic, state_existing);
+    }
+
+    #[test]
+    fn test_default_goldilocks_poseidon2_width_8() {
+        let mut input: [F; 8] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
+        // Known-answer vector for the input [0, 1, ..., 7]; its provenance is not recorded here.
+        let expected: [F; 8] = Goldilocks::new_array([
+            0x020cf04a1b214d14,
+            0x84e14aaaeacaed25,
+            0x1ae0f640e81c7457,
+            0xa4d204cbaeb0d8a5,
+            0x0cf637b627b3a7ff,
+            0x788d304d948b486b,
+            0x7327133ea1949af4,
+            0xf415abb924da395b,
+        ]);
+        // Permute in place with the default width-8 instance.
+        let perm = default_goldilocks_poseidon2_8();
+        perm.permute_mut(&mut input);
+
+        assert_eq!(input, expected);
+    }
+
+    #[test]
+    fn test_default_goldilocks_poseidon2_width_12() {
+        let mut input: [F; 12] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
+        // Known-answer vector for the input [0, 1, ..., 11]; its provenance is not recorded here.
+        let expected: [F; 12] = Goldilocks::new_array([
+            0xf292ab67c0f14b03,
+            0x0a32f1b37656544c,
+            0x053c61ab895498de,
+            0x02ff92e55b196ffb,
+            0x58176e8f6f58cab2,
+            0xb0aa1206e7aec0f8,
+            0xe90c13f3dce83ca4,
+            0xf4da15333edf39c2,
+            0x23b701c053c2ca6c,
+            0xd233d593dcdfbf58,
+            0x4effa5f9516fb52e,
+            0x0aaf4489f1f40166,
+        ]);
+        // Permute in place with the default width-12 instance.
+        let perm = default_goldilocks_poseidon2_12();
+        perm.permute_mut(&mut input);
+
+        assert_eq!(input, expected);
+    }
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs
new file mode 100644
index 000000000..44fe4fa3f
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs
@@ -0,0 +1,86 @@
+use p3_mds::MdsPermutation;
+use p3_mds::util::apply_circulant;
+use p3_symmetric::Permutation;
+
+use crate::x86_64_avx2::packing::PackedGoldilocksAVX2;
+use crate::{
+    MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW,
+    MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks,
+};
+const fn convert_array<const N: usize>(arr: [i64; N]) -> [u64; N] {
+    // Bit-for-bit `i64 -> u64` cast per entry (`for` is unavailable in `const fn`).
+    let (mut out, mut left) = ([0u64; N], N);
+    while left > 0 {
+        left -= 1;
+        out[left] = arr[left] as u64;
+    }
+    out
+}
+
+impl Permutation<[PackedGoldilocksAVX2; 8]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [PackedGoldilocksAVX2; 8]) -> [PackedGoldilocksAVX2; 8] {
+        const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW);
+        apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input) // row cast to u64 at compile time
+    }
+}
+// Marker impl: the body is empty, no methods are defined here.
+impl MdsPermutation<PackedGoldilocksAVX2, 8> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksAVX2; 12]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [PackedGoldilocksAVX2; 12]) -> [PackedGoldilocksAVX2; 12] {
+        const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW);
+        apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input) // row cast to u64 at compile time
+    }
+}
+// Marker impl: the body is empty, no methods are defined here.
+impl MdsPermutation<PackedGoldilocksAVX2, 12> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksAVX2; 16]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [PackedGoldilocksAVX2; 16]) -> [PackedGoldilocksAVX2; 16] {
+        const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW);
+        apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input) // row cast to u64 at compile time
+    }
+}
+// Marker impl: the body is empty, no methods are defined here.
+impl MdsPermutation<PackedGoldilocksAVX2, 16> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksAVX2; 24]> for MdsMatrixGoldilocks {
+    fn permute(&self, input: [PackedGoldilocksAVX2; 24]) -> [PackedGoldilocksAVX2; 24] {
+        apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) // used as-is, no `convert_array`
+    }
+}
+// Marker impl: the body is empty, no methods are defined here.
+impl MdsPermutation<PackedGoldilocksAVX2, 24> for MdsMatrixGoldilocks {}
+
+#[cfg(test)]
+mod tests {
+    use p3_symmetric::Permutation;
+    use rand::rngs::SmallRng;
+    use rand::{RngExt, SeedableRng};
+
+    use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX2};
+
+    /// Generates a test named `$name` for state width `$width`.
+    ///
+    /// The test permutes a seeded random scalar state, permutes the same state with every
+    /// scalar broadcast into a packed value, and compares lane 0 of the packed result with
+    /// the scalar result.
+    macro_rules! test_avx2_mds {
+        ($name:ident, $width:literal) => {
+            #[test]
+            fn $name() {
+                let matrix = MdsMatrixGoldilocks;
+                let scalars: [Goldilocks; $width] = SmallRng::seed_from_u64(1).random();
+                let reference = matrix.permute(scalars);
+
+                // `From<Goldilocks>` copies the scalar into every lane.
+                let packed = matrix.permute(scalars.map(PackedGoldilocksAVX2::from));
+                assert_eq!(packed.map(|lanes| lanes.0[0]), reference);
+            }
+        };
+    }
+
+    test_avx2_mds!(test_avx2_mds_width_8, 8);
+    test_avx2_mds!(test_avx2_mds_width_12, 12);
+    test_avx2_mds!(test_avx2_mds_width_16, 16);
+    test_avx2_mds!(test_avx2_mds_width_24, 24);
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs
new file mode 100644
index 000000000..09300a20f
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs
@@ -0,0 +1,3 @@
+mod mds;
+mod packing;
+pub use packing::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs
new file mode 100644
index 000000000..217a2b2e0
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs
@@ -0,0 +1,539 @@
+use alloc::vec::Vec;
+use core::arch::x86_64::*;
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::mem::transmute;
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use p3_field::exponentiation::exp_10540996611094048183;
+use p3_field::interleave::{interleave_u64, interleave_u128};
+use p3_field::op_assign_macros::{
+    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
+    impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
+    ring_sum,
+};
+use p3_field::{
+    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
+    PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2,
+};
+use p3_util::reconstitute_from_base;
+use rand::distr::{Distribution, StandardUniform};
+use rand::{Rng, RngExt};
+
+use crate::{Goldilocks, P};
+
+const WIDTH: usize = 4;
+
+/// Vectorized AVX2 implementation of `Goldilocks` arithmetic (`WIDTH` = 4 elements per value).
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+#[repr(transparent)] // Needed to make `transmute`s safe.
+#[must_use]
+pub struct PackedGoldilocksAVX2(pub [Goldilocks; WIDTH]);
+
+impl PackedGoldilocksAVX2 {
+    /// Get an arch-specific vector representing the packed values.
+    #[inline]
+    #[must_use]
+    pub(crate) fn to_vector(self) -> __m256i {
+        unsafe {
+            // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It
+            // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be
+            // transmuted to `__m256i`, since arrays are guaranteed to be contiguous in memory.
+            // Finally `PackedGoldilocksAVX2` is `repr(transparent)` so it can be transmuted to
+            // `[Goldilocks; WIDTH]`.
+            transmute(self)
+        }
+    }
+
+    /// Make a packed field vector from an arch-specific vector.
+    ///
+    /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function
+    /// is safe unlike the `Mersenne31/MontyField31` variants.
+    #[inline]
+    pub(crate) fn from_vector(vector: __m256i) -> Self {
+        unsafe {
+            // Safety: `__m256i` can be transmuted to `[u64; WIDTH]` (since array elements are
+            // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since
+            // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to
+            // `PackedGoldilocksAVX2` (since `PackedGoldilocksAVX2` is also `repr(transparent)`).
+            transmute(vector)
+        }
+    }
+
+    /// Copy `value` to all positions in a packed vector. This is the same as
+    /// `From<Goldilocks>::from`, but `const`.
+    #[inline]
+    const fn broadcast(value: Goldilocks) -> Self {
+        Self([value; WIDTH])
+    }
+}
+
+impl From<Goldilocks> for PackedGoldilocksAVX2 {
+    fn from(x: Goldilocks) -> Self {
+        Self::broadcast(x) // one scalar copied into all `WIDTH` lanes
+    }
+}
+
+impl Add for PackedGoldilocksAVX2 {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        Self::from_vector(add(self.to_vector(), rhs.to_vector())) // module-level `add`
+    }
+}
+
+impl Sub for PackedGoldilocksAVX2 {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        Self::from_vector(sub(self.to_vector(), rhs.to_vector())) // module-level `sub`
+    }
+}
+
+impl Neg for PackedGoldilocksAVX2 {
+    type Output = Self;
+    #[inline]
+    fn neg(self) -> Self {
+        Self::from_vector(neg(self.to_vector())) // free function `neg`, not this trait method
+    }
+}
+
+impl Mul for PackedGoldilocksAVX2 {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        Self::from_vector(mul(self.to_vector(), rhs.to_vector())) // free function `mul`
+    }
+}
+
+impl_add_assign!(PackedGoldilocksAVX2);
+impl_sub_assign!(PackedGoldilocksAVX2);
+impl_mul_methods!(PackedGoldilocksAVX2);
+ring_sum!(PackedGoldilocksAVX2);
+impl_rng!(PackedGoldilocksAVX2);
+
+impl PrimeCharacteristicRing for PackedGoldilocksAVX2 {
+    type PrimeSubfield = Goldilocks;
+
+    const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
+    const ONE: Self = Self::broadcast(Goldilocks::ONE);
+    const TWO: Self = Self::broadcast(Goldilocks::TWO);
+    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
+
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        f.into() // `From<Goldilocks>`: broadcast to every lane
+    }
+
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::from_vector(halve(self.to_vector())) // free function `halve`, not this method
+    }
+
+    #[inline]
+    fn square(&self) -> Self {
+        Self::from_vector(square(self.to_vector())) // free function `square`, not this method
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY: `Self` is `repr(transparent)` over `[Goldilocks; WIDTH]` (len * WIDTH scalars).
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
+    }
+}
+
+// Degree of the smallest permutation polynomial for Goldilocks: as
+// p - 1 = 2^32 * 3 * 5 * 17 * ..., the smallest degree D > 1 with gcd(p - 1, D) = 1 is 7
+// (2, 3, 4, 5 and 6 each share a factor with p - 1).
+impl InjectiveMonomial<7> for PackedGoldilocksAVX2 {}
+
+impl PermutationMonomial<7> for PackedGoldilocksAVX2 {
+    /// In the field `Goldilocks`, `a^{1/7}` is equal to `a^{10540996611094048183}`.
+    ///
+    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2^32) + 1 = 1 mod (p - 1)`.
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self)
+    }
+}
+
+impl_add_base_field!(PackedGoldilocksAVX2, Goldilocks);
+impl_sub_base_field!(PackedGoldilocksAVX2, Goldilocks);
+impl_mul_base_field!(PackedGoldilocksAVX2, Goldilocks);
+impl_div_methods!(PackedGoldilocksAVX2, Goldilocks);
+impl_sum_prod_base_field!(PackedGoldilocksAVX2, Goldilocks);
+
+impl Algebra<Goldilocks> for PackedGoldilocksAVX2 {
+    // Benchmarked on AVX2: chunk=32 ≈ 226ns, chunk=2 ≈ 228ns, chunk=16 ≈ 229ns.
+    const BATCHED_LC_CHUNK: usize = 32; // fastest of the three timings above, by roughly 1%
+}
+
+impl_packed_value!(PackedGoldilocksAVX2, Goldilocks, WIDTH);
+
+unsafe impl PackedField for PackedGoldilocksAVX2 {
+    type Scalar = Goldilocks; // NOTE(review): this `unsafe impl` has no SAFETY comment
+}
+
+impl_packed_field_pow_2!(
+    PackedGoldilocksAVX2;
+    [
+        (1, interleave_u64),
+        (2, interleave_u128),
+    ],
+    WIDTH
+);
+
+// Resources:
+// 1. Intel Intrinsics Guide for explanation of each intrinsic:
+//    https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html
+// 3. Intel optimization manual for introduction to x86 vector extensions and best practices:
+//    https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html
+
+// Preliminary knowledge:
+// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with
+//    _mm256_blendv_epi8 or similar instruction. If all we're doing is conditionally zeroing a
+//    vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper.
+//
+// 2. AVX does not support addition with carry but 128-bit (2-word) addition can be easily
+//    emulated. The method relies on the fact that a + b overflowed iff (a + b) < a:
+//        i. res_lo = a_lo + b_lo
+//       ii. carry_mask = res_lo < a_lo
+//      iii. res_hi = a_hi + b_hi - carry_mask
+//    Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions
+//    return -1 (all bits 1) for true and 0 for false.
+//
+// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons
+//    by recognizing that a <u b iff a + (1 << 63) <s b + (1 << 63), where the addition wraps around
+//    and the comparisons are unsigned and signed respectively. The shift function adds/subtracts
+//    1 << 63 to enable this trick.
+//      Example: addition with carry.
+//        i. a_lo_s = shift(a_lo)
+//       ii. res_lo_s = a_lo_s + b_lo
+//      iii. carry_mask = res_lo_s <s a_lo_s
+//       iv. res_lo = shift(res_lo_s)
+//        v. res_hi = a_hi + b_hi - carry_mask
+//    The suffix _s denotes a value that has been shifted by 1 << 63. The result of addition is
+//    shifted if exactly one of the operands is shifted, as is the case on line ii. Line iii.
+//    performs a signed comparison res_lo_s <s a_lo_s on shifted values to emulate unsigned
+//    comparison res_lo <u a_lo on unshifted values. Finally, line iv. reverses the shift so the
+//    result can be returned.
+//      When performing a chain of calculations, we can often save instructions by letting the shift
+//    propagate through and only undoing it when necessary. For example, to compute the addition of
+//    three two-word (128-bit) numbers we can do:
+//        i. a_lo_s = shift(a_lo)
+//       ii. tmp_lo_s = a_lo_s + b_lo
+//      iii. tmp_carry_mask = tmp_lo_s <s a_lo_s
+//       iv. tmp_hi = a_hi + b_hi - tmp_carry_mask
+//        v. res_lo_s = tmp_lo_s + c_lo
+//       vi. res_carry_mask = res_lo_s <s tmp_lo_s
+//      vii. res_lo = shift(res_lo_s)
+//     viii. res_hi = tmp_hi + c_hi - res_carry_mask
+//    Notice that the above 3-value addition still only requires two calls to shift, just like our
+//    2-value addition.
+
+const SIGN_BIT: __m256i = unsafe { transmute([i64::MIN; WIDTH]) }; // bit 63 set in every lane
+const SHIFTED_FIELD_ORDER: __m256i = // FIELD_ORDER with bit 63 flipped, i.e. `shift(FIELD_ORDER)`
+    unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) };
+
+/// Equal to 2^32 - 1 = 2^64 mod P (and to -P in wrapping 64-bit arithmetic), in every lane.
+const EPSILON: __m256i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) };
+
+/// Add 2^63 with overflow. Needed to emulate unsigned comparisons (see point 3. of the
+/// "Preliminary knowledge" notes above).
+#[inline]
+pub fn shift(x: __m256i) -> __m256i {
+    unsafe { _mm256_xor_si256(x, SIGN_BIT) }
+}
+
+/// Convert to canonical representation.
+/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + (1<<63), where x is the field
+///   value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1<<63),
+///   where 0 <= y < FIELD_ORDER).
+#[inline]
+unsafe fn canonicalize_s(x_s: __m256i) -> __m256i {
+    unsafe {
+        // Lane mask: all 1s where x < FIELD_ORDER, all 0s where x >= FIELD_ORDER.
+        let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s);
+        // wrapback_amt is EPSILON (= -FIELD_ORDER mod 2^64) where the mask is 0; otherwise 0.
+        let wrapback_amt = _mm256_andnot_si256(mask, EPSILON);
+        _mm256_add_epi64(x_s, wrapback_amt)
+    }
+}
+
+/// Addition u64 + u64 -> u64 modulo FIELD_ORDER. Assumes that x + y < 2^64 + FIELD_ORDER. The first
+/// argument is unshifted; the second is pre-shifted by 1 << 63. The result is similarly shifted.
+#[inline]
+unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i {
+    unsafe {
+        let res_wrapped_s = _mm256_add_epi64(x, y_s); // Shifted, since exactly one operand is shifted.
+        let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed (res < y) else 0.
+        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // 2^32 - 1 = -FIELD_ORDER mod 2^64 if overflowed else 0.
+        _mm256_add_epi64(res_wrapped_s, wrapback_amt)
+    }
+}
+
+/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER` independently in each 64-bit lane.
+///
+/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn add(x: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let y_s = shift(y); // Shift so the helpers can use signed compares as unsigned ones.
+        let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); // y < FIELD_ORDER, so x + y < 2^64 + FIELD_ORDER.
+        shift(res_s) // Undo the shift before returning.
+    }
+}
+
+/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER` independently in each lane.
+///
+/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn sub(x: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let mut y_s = shift(y);
+        y_s = canonicalize_s(y_s); // y < FIELD_ORDER from here on, so one wrap-back is enough.
+        let x_s = shift(x);
+        let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0.
+        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0.
+        let res_wrapped = _mm256_sub_epi64(x_s, y_s); // The two shifts cancel: this is x - y mod 2^64.
+        _mm256_sub_epi64(res_wrapped, wrapback_amt) // Subtracting -FIELD_ORDER adds FIELD_ORDER back.
+    }
+}
+
+/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER` independently in each lane.
+///
+/// Input can be arbitrary, output may equal `FIELD_ORDER` (when the input is congruent to zero).
+#[inline]
+fn neg(y: __m256i) -> __m256i {
+    unsafe {
+        let y_s = shift(y);
+        _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) // Shifts cancel: FIELD_ORDER - y.
+    }
+}
+
+/// Halve a vector of Goldilocks field elements (each lane is divided by 2 modulo P).
+#[inline(always)]
+pub(crate) fn halve(input: __m256i) -> __m256i {
+    /*
+        We want this to compile to:
+            vpand    least_bit, val, ONE
+            vpsrlq   t, val, 1
+            vpsubq   neg_least_bit, ZERO, least_bit
+            vpand    maybe_half, HALF, neg_least_bit
+            vpaddq   res, t, maybe_half
+        throughput: 1.67 cyc/vec
+        latency: 4 cyc
+
+        Given an element val in [0, P), we want to compute val/2 mod P.
+        If val is even: val/2 mod P = val/2 = val >> 1.
+        If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2
+    */
+    unsafe {
+        // Safety: If this code got compiled then AVX2 intrinsics are available.
+        const ONE: __m256i = unsafe { transmute([1_i64; 4]) };
+        const ZERO: __m256i = unsafe { transmute([0_i64; 4]) };
+        let half = _mm256_set1_epi64x(P.div_ceil(2) as i64); // (P + 1)/2 for odd P. Compiler should realise this is constant.
+
+        let least_bit = _mm256_and_si256(input, ONE); // Determine the parity of val.
+        let t = _mm256_srli_epi64::<1>(input); // val >> 1.
+
+        // Negate the least bit giving us either 0 (all bits 0) or -1 (all bits 1).
+        // It would be better to use vpsignq but this instruction does not exist.
+        let neg_least_bit = _mm256_sub_epi64(ZERO, least_bit);
+
+        // Keeps `half` where least_bit = 1 (val odd) and zeroes the entry where least_bit = 0 (val even).
+        let maybe_half = _mm256_and_si256(half, neg_least_bit);
+        _mm256_add_epi64(t, maybe_half) // NOTE(review): no wrap for Goldilocks P, as t < 2^63 and half < 2^63.
+    }
+}
+
+/// Full 64-bit by 64-bit multiplication, returned as `(hi, lo)` 64-bit halves. This emulation is 1.33x
+/// slower than the scalar instruction, but may be worth it if we want our data to live in vector registers.
+#[inline]
+fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
+    unsafe {
+        // We want to move the high 32 bits to the low position. The multiplication instruction ignores
+        // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can
+        // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication.
+        //   The duplicating instruction (movehdup) is only provided for 32-bit floats, not integers;
+        // the casts are free and they guarantee that the exact bit pattern is preserved.
+        // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency
+        // since Haswell.
+        let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
+        let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y)));
+
+        // All four pairwise multiplications
+        let mul_ll = _mm256_mul_epu32(x, y);
+        let mul_lh = _mm256_mul_epu32(x, y_hi);
+        let mul_hl = _mm256_mul_epu32(x_hi, y);
+        let mul_hh = _mm256_mul_epu32(x_hi, y_hi);
+
+        // Bignum addition
+        // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow.
+        let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll);
+        let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi);
+        // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow.
+        // Also, extract high 32 bits of t0 and add to mul_hh.
+        let t0_lo = _mm256_and_si256(t0, EPSILON); // EPSILON = 2^32 - 1 doubles as the low-32-bit mask.
+        let t0_hi = _mm256_srli_epi64::<32>(t0);
+        let t1 = _mm256_add_epi64(mul_lh, t0_lo);
+        let t2 = _mm256_add_epi64(mul_hh, t0_hi);
+        // Lastly, extract the high 32 bits of t1 and add to t2.
+        let t1_hi = _mm256_srli_epi64::<32>(t1);
+        let res_hi = _mm256_add_epi64(t2, t1_hi);
+
+        // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high
+        // position).
+        let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1)));
+        let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo); // 0xaa: odd 32-bit elements from t1_lo, even from mul_ll.
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Full 64-bit squaring, returned as `(hi, lo)`. This routine is 1.2x faster than the scalar instruction.
+#[inline]
+fn square64(x: __m256i) -> (__m256i, __m256i) {
+    unsafe {
+        // Get high 32 bits of x. See comment in mul64_64.
+        let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
+
+        // All pairwise multiplications.
+        let mul_ll = _mm256_mul_epu32(x, x);
+        let mul_lh = _mm256_mul_epu32(x, x_hi);
+        let mul_hh = _mm256_mul_epu32(x_hi, x_hi);
+
+        // Bignum addition, but mul_lh is shifted by 33 bits (not 32) as the cross term occurs twice.
+        let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll);
+        let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi);
+        let t0_hi = _mm256_srli_epi64::<31>(t0);
+        let res_hi = _mm256_add_epi64(mul_hh, t0_hi);
+
+        // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high
+        // position).
+        let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh);
+        let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo);
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2^63. `y` is unshifted and assumed
+/// to be `<= 2^64 - 2^32 = 0xffffffff00000000`. The result is shifted by 2^63.
+#[inline]
+unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let res_wrapped_s = _mm256_add_epi64(x_s, y);
+        // 32-bit compare is faster than 64-bit. It's safe as long as x > res_wrapped iff x >> 32 >
+        // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case
+        // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 =
+        // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y
+        // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur.
+        let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // High 32 bits: -1 if overflowed else 0.
+        // High 32 bits of mask: 0xffffffff if wraparound occurred, else 0. Low 32 bits are shifted out below.
+        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0.
+        _mm256_add_epi64(res_wrapped_s, wrapback_amt)
+    }
+}
+
+/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2^63. `y` is unshifted and
+/// assumed to be <= `0xffffffff00000000`. The result is shifted by 2^63.
+#[inline]
+unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let res_wrapped_s = _mm256_sub_epi64(x_s, y);
+        // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >>
+        // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where
+        // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 =
+        // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if
+        // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur.
+        let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // High 32 bits: -1 if underflowed else 0.
+        // High 32 bits of mask: 0xffffffff if wraparound occurred, else 0. Low 32 bits are shifted out below.
+        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0.
+        _mm256_sub_epi64(res_wrapped_s, wrapback_amt) // Subtracting -FIELD_ORDER adds FIELD_ORDER back.
+    }
+}
+
+/// Given a 128-bit value represented as two 64-bit halves `(hi, lo)`, reduce it modulo the Goldilocks field order.
+///
+/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`.
+#[inline]
+fn reduce128(x: (__m256i, __m256i)) -> __m256i {
+    unsafe {
+        let (hi0, lo0) = x;
+
+        // First we shift lo0 to lo0_s = lo0 + 2^{63} mod 2^64
+        // This lets us emulate unsigned comparisons
+        let lo0_s = shift(lo0);
+
+        // Get the top 32 bits of hi0.
+        let hi_hi0 = _mm256_srli_epi64::<32>(hi0);
+
+        // Computes lo0_s - hi_hi0 mod FIELD_ORDER.
+        // Makes sense to do as 2^96 = -1 mod FIELD_ORDER.
+        // sub_small_64s_64_s is safe to use as `hi_hi0 < 2^32`.
+        let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0);
+
+        // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER
+        // _mm256_mul_epu32 ignores the top 32 bits so just use that.
+        let t1 = _mm256_mul_epu32(hi0, EPSILON);
+
+        // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 so we can use `add_small_64s_64_s` to get
+        // `lo2_s = lo1_s + t1 mod FIELD_ORDER.`
+        let lo2_s = add_small_64s_64_s(lo1_s, t1);
+
+        // Finally just need to correct for the shift.
+        shift(lo2_s)
+    }
+}
+
+/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER` independently in each lane.
+///
+/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn mul(x: __m256i, y: __m256i) -> __m256i {
+    reduce128(mul64_64(x, y)) // Full 128-bit product per lane, then reduction to 64 bits.
+}
+
+/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER` independently in each lane.
+///
+/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn square(x: __m256i) -> __m256i {
+    reduce128(square64(x)) // Dedicated squaring needs three 32x32 multiplies instead of four.
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field_testing::test_packed_field;
+
+    use super::{Goldilocks, PackedGoldilocksAVX2, WIDTH};
+
+    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([
+        0xFFFF_FFFF_0000_0000, // P - 1
+        0xFFFF_FFFF_FFFF_FFFF, // u64::MAX, above P
+        0x0000_0000_0000_0001, // 1
+        0xFFFF_FFFF_0000_0001, // P itself
+    ]);
+
+    const ZEROS: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([
+        0x0000_0000_0000_0000, // 0
+        0xFFFF_FFFF_0000_0001, // P, congruent to 0
+        0x0000_0000_0000_0000,
+        0xFFFF_FFFF_0000_0001,
+    ]));
+
+    const ONES: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([
+        0x0000_0000_0000_0001, // 1
+        0xFFFF_FFFF_0000_0002, // P + 1, congruent to 1
+        0x0000_0000_0000_0001,
+        0xFFFF_FFFF_0000_0002,
+    ]));
+
+    test_packed_field!( // NOTE(review): presumably args are type, zero values, one values, extra values — confirm.
+        crate::PackedGoldilocksAVX2,
+        &[super::ZEROS],
+        &[super::ONES],
+        crate::PackedGoldilocksAVX2(super::SPECIAL_VALS)
+    );
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs
new file mode 100644
index 000000000..f4d6c9f71
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs
@@ -0,0 +1,86 @@
+use p3_mds::MdsPermutation;
+use p3_mds::util::apply_circulant;
+use p3_symmetric::Permutation;
+
+use crate::x86_64_avx512::packing::PackedGoldilocksAVX512;
+use crate::{
+    MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW,
+    MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks,
+};
+const fn convert_array<const N: usize>(arr: [i64; N]) -> [u64; N] {
+    let mut converted = [0_u64; N]; // `as` below reinterprets the bits, so negative entries wrap.
+    let mut idx = N; // Walk backwards with `while`; `for` loops are not usable in a `const fn`.
+    while idx > 0 {
+        idx -= 1;
+        converted[idx] = arr[idx] as u64;
+    }
+    converted
+}
+
+impl Permutation<[PackedGoldilocksAVX512; 8]> for MdsMatrixGoldilocks { // Width-8 state of packed elements.
+    fn permute(&self, input: [PackedGoldilocksAVX512; 8]) -> [PackedGoldilocksAVX512; 8] {
+        const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW); // i64 row converted once, at compile time.
+        apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input) // NOTE(review): presumably the circulant matrix-vector product — confirm in p3_mds.
+    }
+}
+
+impl MdsPermutation<PackedGoldilocksAVX512, 8> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksAVX512; 12]> for MdsMatrixGoldilocks { // Width-12 state of packed elements.
+    fn permute(&self, input: [PackedGoldilocksAVX512; 12]) -> [PackedGoldilocksAVX512; 12] {
+        const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW); // i64 row converted once, at compile time.
+        apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input) // NOTE(review): presumably the circulant matrix-vector product — confirm in p3_mds.
+    }
+}
+
+impl MdsPermutation<PackedGoldilocksAVX512, 12> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksAVX512; 16]> for MdsMatrixGoldilocks { // Width-16 state of packed elements.
+    fn permute(&self, input: [PackedGoldilocksAVX512; 16]) -> [PackedGoldilocksAVX512; 16] {
+        const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW); // i64 row converted once, at compile time.
+        apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input) // NOTE(review): presumably the circulant matrix-vector product — confirm in p3_mds.
+    }
+}
+
+impl MdsPermutation<PackedGoldilocksAVX512, 16> for MdsMatrixGoldilocks {}
+
+impl Permutation<[PackedGoldilocksAVX512; 24]> for MdsMatrixGoldilocks { // Width-24 state of packed elements.
+    fn permute(&self, input: [PackedGoldilocksAVX512; 24]) -> [PackedGoldilocksAVX512; 24] {
+        apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) // Unlike the smaller widths, this row is passed without `convert_array`.
+    }
+}
+
+impl MdsPermutation<PackedGoldilocksAVX512, 24> for MdsMatrixGoldilocks {}
+
+#[cfg(test)]
+mod tests {
+    use p3_symmetric::Permutation;
+    use rand::rngs::SmallRng;
+    use rand::{RngExt, SeedableRng};
+
+    use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX512};
+
+    macro_rules! test_avx512_mds { // Generates one scalar-vs-packed consistency test per state width.
+        ($name:ident, $width:literal) => {
+            #[test]
+            fn $name() {
+                let mut rng = SmallRng::seed_from_u64(1); // Fixed seed keeps the test deterministic.
+                let mds = MdsMatrixGoldilocks;
+
+                let input: [Goldilocks; $width] = rng.random();
+                let expected = mds.permute(input); // Scalar reference result.
+
+                let packed_input = input.map(Into::<PackedGoldilocksAVX512>::into); // Broadcast each scalar to all lanes.
+                let packed_output = mds.permute(packed_input);
+
+                let avx512_output = packed_output.map(|x| x.0[0]); // NOTE(review): only lane 0 is compared.
+                assert_eq!(avx512_output, expected);
+            }
+        };
+    }
+
+    test_avx512_mds!(test_avx512_mds_width_8, 8);
+    test_avx512_mds!(test_avx512_mds_width_12, 12);
+    test_avx512_mds!(test_avx512_mds_width_16, 16);
+    test_avx512_mds!(test_avx512_mds_width_24, 24);
+}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs
new file mode 100644
index 000000000..09300a20f
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs
@@ -0,0 +1,3 @@
+mod mds;
+mod packing;
+pub use packing::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs
new file mode 100644
index 000000000..0c751b436
--- /dev/null
+++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs
@@ -0,0 +1,444 @@
+use alloc::vec::Vec;
+use core::arch::x86_64::*;
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::mem::transmute;
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use p3_field::exponentiation::exp_10540996611094048183;
+use p3_field::interleave::{interleave_u64, interleave_u128, interleave_u256};
+use p3_field::op_assign_macros::{
+    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
+    impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
+    ring_sum,
+};
+use p3_field::{
+    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
+    PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2,
+};
+use p3_util::reconstitute_from_base;
+use rand::distr::{Distribution, StandardUniform};
+use rand::{Rng, RngExt};
+
+use crate::{Goldilocks, P};
+
+const WIDTH: usize = 8;
+
+/// Vectorized AVX512 implementation of `Goldilocks` arithmetic: `WIDTH` field elements per 512-bit vector.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+#[repr(transparent)] // Needed to make `transmute`s to and from `__m512i` safe.
+#[must_use]
+pub struct PackedGoldilocksAVX512(pub [Goldilocks; WIDTH]);
+
+impl PackedGoldilocksAVX512 {
+    /// Get an arch-specific vector representing the packed values.
+    #[inline]
+    #[must_use]
+    pub(crate) fn to_vector(self) -> __m512i {
+        unsafe {
+            // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It
+            // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be
+            // transmuted to `__m512i`, since arrays are guaranteed to be contiguous in memory.
+            // Finally `PackedGoldilocksAVX512` is `repr(transparent)` so it can be transmuted to
+            // `[Goldilocks; WIDTH]`.
+            transmute(self)
+        }
+    }
+
+    /// Make a packed field vector from an arch-specific vector.
+    ///
+    /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function
+    /// is safe unlike the `Mersenne31/MontyField31` variants.
+    #[inline]
+    pub(crate) fn from_vector(vector: __m512i) -> Self {
+        unsafe {
+            // Safety: `__m512i` can be transmuted to `[u64; WIDTH]` (since array elements are
+            // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since
+            // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to
+            // `PackedGoldilocksAVX512` (since `PackedGoldilocksAVX512` is also `repr(transparent)`).
+            transmute(vector)
+        }
+    }
+
+    /// Copy `value` to all `WIDTH` positions in a packed vector. This is the same as
+    /// `From<Goldilocks>::from`, but `const`.
+    #[inline]
+    const fn broadcast(value: Goldilocks) -> Self {
+        Self([value; WIDTH])
+    }
+}
+
+impl From<Goldilocks> for PackedGoldilocksAVX512 {
+    fn from(x: Goldilocks) -> Self {
+        Self::broadcast(x) // Every lane receives a copy of `x`.
+    }
+}
+
+impl Add for PackedGoldilocksAVX512 {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        Self::from_vector(add(self.to_vector(), rhs.to_vector())) // Inner `add` is this file's free function on raw vectors.
+    }
+}
+
+impl Sub for PackedGoldilocksAVX512 {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        Self::from_vector(sub(self.to_vector(), rhs.to_vector())) // Inner `sub` is this file's free function on raw vectors.
+    }
+}
+
+impl Neg for PackedGoldilocksAVX512 {
+    type Output = Self;
+    #[inline]
+    fn neg(self) -> Self {
+        Self::from_vector(neg(self.to_vector())) // Inner `neg` is this file's free function on raw vectors.
+    }
+}
+
+impl Mul for PackedGoldilocksAVX512 {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        Self::from_vector(mul(self.to_vector(), rhs.to_vector())) // Inner `mul` is this file's free function on raw vectors.
+    }
+}
+
+impl_add_assign!(PackedGoldilocksAVX512);
+impl_sub_assign!(PackedGoldilocksAVX512);
+impl_mul_methods!(PackedGoldilocksAVX512);
+ring_sum!(PackedGoldilocksAVX512);
+impl_rng!(PackedGoldilocksAVX512);
+
+impl PrimeCharacteristicRing for PackedGoldilocksAVX512 {
+    type PrimeSubfield = Goldilocks;
+
+    const ZERO: Self = Self::broadcast(Goldilocks::ZERO); // Each constant is the scalar one copied to all lanes.
+    const ONE: Self = Self::broadcast(Goldilocks::ONE);
+    const TWO: Self = Self::broadcast(Goldilocks::TWO);
+    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
+
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        f.into() // Uses `From<Goldilocks>`, i.e. a broadcast.
+    }
+
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::from_vector(halve(self.to_vector())) // Inner `halve` is this file's free function on raw vectors.
+    }
+
+    #[inline]
+    fn square(&self) -> Self {
+        Self::from_vector(square(self.to_vector())) // Inner `square` is this file's free function on raw vectors.
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY: `Self` is a repr(transparent) wrapper around `[Goldilocks; WIDTH]`.
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
+    }
+}
+
+impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks);
+impl_sub_base_field!(PackedGoldilocksAVX512, Goldilocks);
+impl_mul_base_field!(PackedGoldilocksAVX512, Goldilocks);
+impl_div_methods!(PackedGoldilocksAVX512, Goldilocks);
+impl_sum_prod_base_field!(PackedGoldilocksAVX512, Goldilocks);
+
+impl Algebra<Goldilocks> for PackedGoldilocksAVX512 {
+    // Benchmarked on AVX-512: chunk=4 ≈ 198ns, chunk=2 ≈ 198ns, chunk=32 ≈ 199ns (no measurable difference).
+    const BATCHED_LC_CHUNK: usize = 4;
+}
+
+// Degree of the smallest permutation polynomial for Goldilocks.
+//
+// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D > 1 satisfying gcd(p - 1, D) = 1 is 7.
+impl InjectiveMonomial<7> for PackedGoldilocksAVX512 {}
+
+impl PermutationMonomial<7> for PackedGoldilocksAVX512 {
+    /// In the field `Goldilocks`, `a^{1/7}` is equal to `a^{10540996611094048183}`.
+    ///
+    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2^32) + 1 = 1 mod (p - 1)`.
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self)
+    }
+}
+
+impl_packed_value!(PackedGoldilocksAVX512, Goldilocks, WIDTH);
+
+unsafe impl PackedField for PackedGoldilocksAVX512 { // NOTE(review): the safety contract is defined by p3_field's `PackedField` — confirm it is met.
+    type Scalar = Goldilocks;
+}
+
+impl_packed_field_pow_2!(
+    PackedGoldilocksAVX512;
+    [
+        (1, interleave_u64),
+        (2, interleave_u128),
+        (4, interleave_u256),
+    ],
+    WIDTH
+);
+
+const FIELD_ORDER: __m512i = unsafe { transmute([Goldilocks::ORDER_U64; WIDTH]) }; // The field order in every lane.
+const EPSILON: __m512i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) }; // -FIELD_ORDER mod 2^64 in every lane.
+
+#[inline] // Reduces each lane to [0, FIELD_ORDER) by subtracting FIELD_ORDER at most once.
+unsafe fn canonicalize(x: __m512i) -> __m512i {
+    unsafe {
+        let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER); // Bit set for lanes with x >= FIELD_ORDER (unsigned).
+        _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER) // Subtract in masked lanes; other lanes keep x.
+    }
+}
+
+/// Compute the modular addition `x + y mod FIELD_ORDER`.
+///
+/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider
+/// set of circumstances if bounds on `x` are known.
+///
+/// The result will be a u64 which may be greater than FIELD_ORDER.
+///
+/// # Safety
+/// User must ensure that x + y < 2^64 + FIELD_ORDER.
+#[inline]
+unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i {
+    unsafe {
+        let res_wrapped = _mm512_add_epi64(x, y);
+        let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed
+        _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER) // subtract FIELD_ORDER only in overflowed lanes
+    }
+}
+
+/// Compute the modular subtraction `x - y mod FIELD_ORDER`.
+///
+/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider
+/// set of circumstances if bounds on `x` are known.
+///
+/// The result will be a u64 which may be greater than FIELD_ORDER.
+///
+/// # Safety
+/// User must ensure that x - y > -FIELD_ORDER.
+#[inline]
+unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i {
+    unsafe {
+        let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y)
+        let res_wrapped = _mm512_sub_epi64(x, y);
+        _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER) // add FIELD_ORDER back only in underflowed lanes
+    }
+}
+
+/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER` independently in each lane.
+///
+/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn add(x: __m512i, y: __m512i) -> __m512i {
+    unsafe { add_no_double_overflow_64_64(x, canonicalize(y)) } // y < FIELD_ORDER after canonicalize, as the helper requires.
+}
+
+/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER` independently in each lane.
+///
+/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn sub(x: __m512i, y: __m512i) -> __m512i {
+    unsafe { sub_no_double_overflow_64_64(x, canonicalize(y)) } // y < FIELD_ORDER after canonicalize, as the helper requires.
+}
+
+/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER` independently in each lane.
+///
+/// Input can be arbitrary, output may equal `FIELD_ORDER` (when the input is congruent to zero).
+#[inline]
+fn neg(y: __m512i) -> __m512i {
+    unsafe { _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) } // Cannot underflow: canonicalize(y) < FIELD_ORDER.
+}
+
+/// Halve a vector of Goldilocks field elements (each lane is divided by 2 modulo P).
+#[inline(always)]
+pub(crate) fn halve(input: __m512i) -> __m512i {
+    /*
+        We want this to compile to:
+            vptestmq  least_bit, val, ONE
+            vpsrlq    res, val, 1
+            vpaddq    res{least_bit}, res, HALF
+        throughput: 2 cyc/vec
+        latency: 4 cyc
+
+        Given an element val in [0, P), we want to compute val/2 mod P.
+        If val is even: val/2 mod P = val/2 = val >> 1.
+        If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2
+    */
+    unsafe {
+        // Safety: If this code got compiled then AVX512 intrinsics are available.
+        const ONE: __m512i = unsafe { transmute([1_i64; 8]) };
+        let half = _mm512_set1_epi64(P.div_ceil(2) as i64); // (P + 1)/2 for odd P. Compiler realises this is constant.
+
+        let least_bit = _mm512_test_epi64_mask(input, ONE); // Determine the parity of val.
+        let t = _mm512_srli_epi64::<1>(input); // val >> 1.
+        // Adds `half` to `t` in lanes where least_bit = 1 and leaves `t` unchanged where least_bit = 0.
+        _mm512_mask_add_epi64(t, least_bit, t, half)
+    }
+}
+
+// One bit per 32-bit element; the set bits select the even (low) halves. `__mmask16` is a `u16` alias.
+const LO_32_BITS_MASK: __mmask16 = 0b0101010101010101;
+
+/// Full 64-bit by 64-bit multiplication, returned as `(hi, lo)` 64-bit halves of each 128-bit product.
+#[inline]
+fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
+    unsafe {
+        // We want to move the high 32 bits to the low position. The multiplication instruction ignores
+        // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can
+        // be done on port 5; bitshifts run on port 0, competing with multiplication.
+        //   The duplicating instruction (movehdup) is only provided for 32-bit floats, not integers;
+        // the casts are free and they guarantee that the exact bit pattern is preserved.
+        // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency
+        // since Haswell.
+        let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x)));
+        let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y)));
+
+        // All four pairwise multiplications
+        let mul_ll = _mm512_mul_epu32(x, y);
+        let mul_lh = _mm512_mul_epu32(x, y_hi);
+        let mul_hl = _mm512_mul_epu32(x_hi, y);
+        let mul_hh = _mm512_mul_epu32(x_hi, y_hi);
+
+        // Bignum addition
+        // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow.
+        let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll);
+        let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi);
+        // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow.
+        // Also, extract high 32 bits of t0 and add to mul_hh.
+        let t0_lo = _mm512_and_si512(t0, EPSILON); // EPSILON = 2^32 - 1 doubles as the low-32-bit mask.
+        let t0_hi = _mm512_srli_epi64::<32>(t0);
+        let t1 = _mm512_add_epi64(mul_lh, t0_lo);
+        let t2 = _mm512_add_epi64(mul_hh, t0_hi);
+        // Lastly, extract the high 32 bits of t1 and add to t2.
+        let t1_hi = _mm512_srli_epi64::<32>(t1);
+        let res_hi = _mm512_add_epi64(t2, t1_hi);
+
+        // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high
+        // position).
+        let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1)));
+        let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); // Set mask bits take mul_ll, clear bits take t1_lo.
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Full 64-bit squaring, returned as `(hi, lo)` 64-bit halves of each 128-bit square.
+#[inline]
+fn square64(x: __m512i) -> (__m512i, __m512i) {
+    unsafe {
+        // Get high 32 bits of x. See comment in mul64_64.
+        let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x)));
+
+        // All pairwise multiplications.
+        let mul_ll = _mm512_mul_epu32(x, x);
+        let mul_lh = _mm512_mul_epu32(x, x_hi);
+        let mul_hh = _mm512_mul_epu32(x_hi, x_hi);
+
+        // Bignum addition, but mul_lh is shifted by 33 bits (not 32) as the cross term occurs twice.
+        let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll);
+        let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi);
+        let t0_hi = _mm512_srli_epi64::<31>(t0);
+        let res_hi = _mm512_add_epi64(mul_hh, t0_hi);
+
+        // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high
+        // position).
+        let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh);
+        let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo);
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Given a 128-bit value represented as two 64-bit halves `(hi, lo)`, reduce it modulo the Goldilocks field order.
+///
+/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`.
+#[inline]
+fn reduce128(x: (__m512i, __m512i)) -> __m512i {
+    unsafe {
+        let (hi0, lo0) = x;
+
+        // Find the high 32 bits of hi0.
+        let hi_hi0 = _mm512_srli_epi64::<32>(hi0);
+
+        // Computes lo0 - hi_hi0 mod FIELD_ORDER.
+        // Makes sense to do as 2^96 = -1 mod FIELD_ORDER.
+        // `sub_no_double_overflow_64_64` is safe to use as `hi_hi0 < 2^32`.
+        let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0);
+
+        // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER
+        // _mm512_mul_epu32 ignores the top 32 bits so just use that.
+        let t1 = _mm512_mul_epu32(hi0, EPSILON);
+
+        // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 < FIELD_ORDER so we can use `add_no_double_overflow_64_64` to get
+        // `lo1 + t1 mod FIELD_ORDER.`
+        add_no_double_overflow_64_64(lo1, t1)
+    }
+}
+
+/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER` independently in each lane.
+///
+/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn mul(x: __m512i, y: __m512i) -> __m512i {
+    reduce128(mul64_64(x, y)) // Full 128-bit product per lane, then reduction to 64 bits.
+}
+
+/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER` independently in each lane.
+///
+/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
+#[inline]
+fn square(x: __m512i) -> __m512i {
+    reduce128(square64(x)) // Dedicated squaring needs three 32x32 multiplies instead of four.
+}
+
+#[cfg(test)]
+mod tests {
+    use p3_field_testing::test_packed_field;
+
+    use super::{Goldilocks, PackedGoldilocksAVX512, WIDTH};
+
+    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([
+        0xFFFF_FFFF_0000_0001, // P itself
+        0xFFFF_FFFF_0000_0000, // P - 1
+        0xFFFF_FFFE_FFFF_FFFF, // P - 2^32 - 2
+        0xFFFF_FFFF_FFFF_FFFF, // u64::MAX, above P
+        0x0000_0000_0000_0000, // 0
+        0x0000_0000_0000_0001, // 1
+        0x0000_0000_0000_0002, // 2
+        0x0FFF_FFFF_F000_0000, // set bits straddle the 32-bit boundary
+    ]);
+
+    const ZEROS: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([
+        0x0000_0000_0000_0000, // 0
+        0xFFFF_FFFF_0000_0001, // P, congruent to 0
+        0x0000_0000_0000_0000,
+        0xFFFF_FFFF_0000_0001,
+        0x0000_0000_0000_0000,
+        0xFFFF_FFFF_0000_0001,
+        0x0000_0000_0000_0000,
+        0xFFFF_FFFF_0000_0001,
+    ]));
+
+    const ONES: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([
+        0x0000_0000_0000_0001, // 1
+        0xFFFF_FFFF_0000_0002, // P + 1, congruent to 1
+        0x0000_0000_0000_0001,
+        0xFFFF_FFFF_0000_0002,
+        0x0000_0000_0000_0001,
+        0xFFFF_FFFF_0000_0002,
+        0x0000_0000_0000_0001,
+        0xFFFF_FFFF_0000_0002,
+    ]));
+
+    test_packed_field!( // NOTE(review): presumably args are type, zero values, one values, extra values — confirm.
+        crate::PackedGoldilocksAVX512,
+        &[super::ZEROS],
+        &[super::ONES],
+        crate::PackedGoldilocksAVX512(super::SPECIAL_VALS)
+    );
+}
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
new file mode 100755
index 000000000..01e3a5306
--- /dev/null
+++ b/bench_vs_plonky3/run.sh
@@ -0,0 +1,410 @@
+#!/bin/bash
+# Benchmark: Lambda STARK vs Plonky3 — single-shot prove time on the shared
+# Fibonacci AIR (columns = 2 * num_sequences, blowup = 2, fri_queries = 219).
+#
+# Usage:
+#   ./bench_vs_plonky3/run.sh [--log-rows K ...] [--num-sequences N] [--runs N]
+#                             [--lambda-only | --p3-only] [--report-dir DIR]
+#                             [--no-p3-patch] [--scalar] [--no-color]
+#
+# Defaults: --log-rows 19, --num-sequences 16, --runs 3.
+# With multiple --log-rows values, prints one median row per size.
+#
+# --scalar: disables SIMD at the target-feature level. On x86_64 drops AVX2
+# and AVX-512 (Goldilocks + most of Keccak go scalar, residual SSE2 in
+# p3-keccak). On aarch64 drops the SHA3 NEON extension. Triggers a rebuild
+# when toggling; subsequent runs with the same RUSTFLAGS are cached.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+TMP_DIR="/tmp/bench_p3"
+REPORT_DIR=""
+NO_COLOR=false
+NO_P3_PATCH=false
+SCALAR=false
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BOLD='\033[1m'
+NC='\033[0m'
+
+LOG_ROWS=()
+NUM_SEQUENCES=16
+RUNS=3
+RUN_LAMBDA=true
+RUN_P3=true
+
+# --- Parse args -------------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --log-rows)   # greedy: collects every following word that does not start with "--"
+            shift
+            while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
+                LOG_ROWS+=("$1")
+                shift
+            done
+            ;;
+        --num-sequences)
+            if [[ $# -lt 2 ]]; then echo "--num-sequences requires an argument" >&2; exit 1; fi
+            NUM_SEQUENCES=$2
+            shift 2
+            ;;
+        --runs)
+            if [[ $# -lt 2 ]]; then echo "--runs requires an argument" >&2; exit 1; fi
+            RUNS=$2
+            shift 2
+            ;;
+        --lambda-only)
+            RUN_P3=false
+            shift
+            ;;
+        --p3-only)
+            RUN_LAMBDA=false
+            shift
+            ;;
+        --report-dir)
+            if [[ $# -lt 2 ]]; then echo "--report-dir requires an argument" >&2; exit 1; fi
+            REPORT_DIR=$2
+            shift 2
+            ;;
+        --no-p3-patch)
+            NO_P3_PATCH=true
+            shift
+            ;;
+        --scalar)
+            SCALAR=true
+            shift
+            ;;
+        --no-color)
+            NO_COLOR=true
+            shift
+            ;;
+        -h|--help)
+            sed -n '2,16p' "$0" | sed 's/^# \{0,1\}//'   # whole header comment, incl. the --scalar notes
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+if [ ${#LOG_ROWS[@]} -eq 0 ]; then
+    LOG_ROWS=(19)
+fi
+
+if ! $RUN_LAMBDA && ! $RUN_P3; then
+    echo "At least one prover must be enabled"
+    exit 1
+fi
+
+if [ "$RUNS" -lt 1 ]; then
+    echo "--runs must be >= 1"
+    exit 1
+fi
+
+if $NO_COLOR; then
+    RED=''
+    GREEN=''
+    YELLOW=''
+    BOLD=''
+    NC=''
+fi
+
+mkdir -p "$TMP_DIR"
+rm -rf "$TMP_DIR"/*
+
+if [ -n "$REPORT_DIR" ]; then
+    mkdir -p "$REPORT_DIR/raw"
+fi
+
+# --- Patch toggle -----------------------------------------------------------
+# The root Cargo.toml has a [patch.crates-io] block pointing at the vendored
+# p3-goldilocks-patched (adds BinomiallyExtendable<3>, disables NEON). For the
+# nightly we build against vanilla crates.io p3-goldilocks — we comment the
+# block out and drop the `p3-degree3` feature.
+CARGO_TOML="$ROOT_DIR/Cargo.toml"
+CARGO_TOML_BAK=""
+BUILD_FEATURE_FLAGS=()
+if $NO_P3_PATCH; then
+    CARGO_TOML_BAK="$CARGO_TOML.bak.p3bench.$$"
+    cp "$CARGO_TOML" "$CARGO_TOML_BAK"
+    # Arm the restore once the backup is complete and BEFORE rewriting, so a failure below is undone.
+    trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi' EXIT INT TERM
+    # Comment the [patch.crates-io] block and its entries (until the next blank line or [section]).
+    python3 - "$CARGO_TOML" <<'PY'
+import sys, pathlib
+path = pathlib.Path(sys.argv[1])
+lines = path.read_text().splitlines(keepends=True)
+out = []
+in_patch = False
+for ln in lines:
+    stripped = ln.strip()
+    if stripped == "[patch.crates-io]":
+        in_patch = True
+        out.append("# " + ln if not ln.startswith("#") else ln)
+        continue
+    if in_patch:
+        if stripped.startswith("[") and stripped.endswith("]"):
+            in_patch = False
+            out.append(ln)
+            continue
+        if stripped == "":
+            in_patch = False
+            out.append(ln)
+            continue
+        out.append("# " + ln if not ln.startswith("#") else ln)
+    else:
+        out.append(ln)
+path.write_text("".join(out))
+PY
+    BUILD_FEATURE_FLAGS=(--no-default-features --features parallel)
+fi
+
+# --- Scalar (no SIMD) toggle ------------------------------------------------
+# When --scalar is on, disable vector instruction sets for the build so both
+# provers run against the same scalar baseline. p3-keccak keeps SSE2 residual
+# on x86 — acceptable per the bench workstream (contribution is ~7%).
+#   x86_64   → -avx2,-avx512f         (Goldilocks + most of Keccak go scalar)
+#   aarch64  → -sha3                   (drops Keccak NEON SHA3 extension)
+# Cargo caches per-RUSTFLAGS, so toggling scalar vs vector triggers a rebuild
+# on first use but is cached afterwards.
+SCALAR_RUSTFLAGS=""
+if $SCALAR; then
+    case "$(uname -m)" in
+        x86_64|amd64)
+            SCALAR_RUSTFLAGS="-C target-feature=-avx2,-avx512f"
+            ;;
+        arm64|aarch64)
+            SCALAR_RUSTFLAGS="-C target-feature=-sha3"
+            ;;
+        *)
+            echo "warning: --scalar: unknown arch $(uname -m); not pinning RUSTFLAGS" >&2
+            ;;
+    esac
+    if [ -n "$SCALAR_RUSTFLAGS" ]; then
+        if [ -n "${RUSTFLAGS:-}" ]; then
+            export RUSTFLAGS="${RUSTFLAGS} ${SCALAR_RUSTFLAGS}"
+        else
+            export RUSTFLAGS="$SCALAR_RUSTFLAGS"
+        fi
+    fi
+fi
+
+# --- Build ------------------------------------------------------------------
+echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}"
+echo -e "  log-rows:       ${YELLOW}${LOG_ROWS[*]}${NC}"
+echo -e "  num-sequences:  ${YELLOW}${NUM_SEQUENCES}${NC}  (columns = $((2 * NUM_SEQUENCES)))"
+echo -e "  runs/size:      ${YELLOW}${RUNS}${NC}  (median reported)"
+if $NO_P3_PATCH; then
+    echo -e "  p3 extension:   ${YELLOW}degree 2 (vanilla, no patch)${NC}"
+else
+    echo -e "  p3 extension:   ${YELLOW}degree 3 (patched, matches Lambda)${NC}"
+fi
+if $SCALAR; then
+    echo -e "  scalar mode:    ${YELLOW}on${NC}  (arch=$(uname -m), RUSTFLAGS=\"${RUSTFLAGS:-}\")"
+else
+    echo -e "  scalar mode:    ${YELLOW}off${NC}  (SIMD enabled, compiler default)"
+fi
+echo ""
+
+echo -e "${GREEN}[build]${NC} prove_bench"
+# Use the `${arr[@]+...}` expansion so `set -u` doesn't blow up when the
+# feature-flag array is empty (bash 3 on macOS).
+cargo build --release -p bench-vs-plonky3 --bin prove_bench \
+    --manifest-path "$ROOT_DIR/Cargo.toml" \
+    ${BUILD_FEATURE_FLAGS[@]+"${BUILD_FEATURE_FLAGS[@]}"} 2>&1 | tail -5
+
+BIN="$ROOT_DIR/target/release/prove_bench"
+if [ ! -x "$BIN" ]; then
+    echo -e "${RED}[build] prove_bench not produced at $BIN${NC}"
+    exit 1
+fi
+
+# --- Helpers ----------------------------------------------------------------
+extract_proving_time() {
+    sed -nE '/Proving time: [0-9.]+s/ {
+        s/.*Proving time: ([0-9.]+)s.*/\1/
+        p
+        q
+    }'
+}
+
+median_of() {
+    # prints median of the given numeric arguments (rounded to 3 decimals).
+    # Uses shell `sort -g` for portability (macOS awk lacks gawk's asort).
+    printf '%s\n' "$@" | LC_ALL=C sort -g | LC_NUMERIC=C awk '
+        { a[NR] = $0 + 0 }
+        END {
+            if (NR == 0) { print "n/a"; exit }
+            if (NR % 2 == 1) {
+                printf "%.3f\n", a[(NR + 1) / 2]
+            } else {
+                printf "%.3f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2
+            }
+        }'
+}
+
+ratio_fmt() {
+    LC_NUMERIC=C awk -v num="$1" -v den="$2" 'BEGIN {
+        if (den + 0 == 0) { print "n/a"; exit }
+        printf "%.3f\n", num / den
+    }'
+}
+
+# --- Run benchmark ----------------------------------------------------------
+
+RESULT_LOG_ROWS=()
+RESULT_ROWS=()
+RESULT_LAMBDA=()
+RESULT_P3=()
+RESULT_RATIO=()
+
+run_prover() {  # stdout carries ONLY the median (callers capture it); diagnostics go to stderr
+    local prover=$1   # lambda | p3
+    local log_rows=$2
+    local times=()
+    for run_i in $(seq 1 "$RUNS"); do
+        local out_file="$TMP_DIR/${prover}_${log_rows}_${run_i}.stdout"
+        if ! "$BIN" --prover "$prover" \
+                --log-rows "$log_rows" \
+                --num-sequences "$NUM_SEQUENCES" > "$out_file" 2>&1; then
+            echo -e "  ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}" >&2
+            cat "$out_file" >&2
+            exit 1   # ends the caller's $(...) subshell; the script's `set -e` then aborts the run
+        fi
+        local t
+        t=$(extract_proving_time < "$out_file")
+        if [ -z "$t" ]; then
+            echo -e "  ${RED}[${prover}] could not parse proving time (log-rows=${log_rows}, run ${run_i})${NC}" >&2
+            cat "$out_file" >&2
+            exit 1
+        fi
+        times+=("$t")
+        if [ -n "$REPORT_DIR" ]; then
+            cp "$out_file" "$REPORT_DIR/raw/${prover}_log${log_rows}_run${run_i}.stdout"
+        fi
+    done
+    median_of "${times[@]}"
+    printf '%s\n' "${times[@]}" > "$TMP_DIR/${prover}_${log_rows}.times"   # per-run times, read back by the caller
+}
+
+for lr in "${LOG_ROWS[@]}"; do
+    rows=$((1 << lr))
+    echo -e "${BOLD}--- log-rows=${lr}  (rows = ${rows}) ---${NC}"
+
+    lambda_median="n/a"
+    p3_median="n/a"
+
+    if $RUN_LAMBDA; then
+        echo -ne "  ${GREEN}[lambda]${NC} "   # -e needed: colour vars hold literal \033 escapes
+        lambda_median=$(run_prover lambda "$lr")
+        echo -e "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")"
+    fi
+
+    if $RUN_P3; then
+        echo -ne "  ${GREEN}[p3]${NC}     "
+        p3_median=$(run_prover p3 "$lr")
+        echo -e "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")"
+    fi
+
+    local_ratio="n/a"   # stays n/a unless both provers ran for this size
+    if $RUN_LAMBDA && $RUN_P3; then
+        local_ratio=$(ratio_fmt "$lambda_median" "$p3_median")
+    fi
+
+    RESULT_LOG_ROWS+=("$lr")
+    RESULT_ROWS+=("$rows")
+    RESULT_LAMBDA+=("$lambda_median")
+    RESULT_P3+=("$p3_median")
+    RESULT_RATIO+=("$local_ratio")
+done
+
+# --- Summary table ----------------------------------------------------------
+
+echo ""
+echo -e "${BOLD}=== Summary ===${NC}"
+if $RUN_LAMBDA && $RUN_P3; then
+    printf "  %-9s  %-12s  %14s  %14s  %10s\n" "log-rows" "rows" "Lambda (s)" "P3 (s)" "L/P3"
+    printf "  %-9s  %-12s  %14s  %14s  %10s\n" "--------" "----" "----------" "------" "----"
+else
+    printf "  %-9s  %-12s  %14s\n" "log-rows" "rows" "Time (s)"
+    printf "  %-9s  %-12s  %14s\n" "--------" "----" "--------"
+fi
+
+for i in "${!RESULT_LOG_ROWS[@]}"; do
+    lr="${RESULT_LOG_ROWS[$i]}"
+    rows="${RESULT_ROWS[$i]}"
+    lt="${RESULT_LAMBDA[$i]}"
+    pt="${RESULT_P3[$i]}"
+    rt="${RESULT_RATIO[$i]}"
+    if $RUN_LAMBDA && $RUN_P3; then
+        color=$GREEN
+        if awk -v l="$lt" -v p="$pt" 'BEGIN{ exit !(l+0 > p+0) }'; then
+            color=$RED
+        fi
+        printf "  %-9s  %-12s  %13ss  %13ss  ${color}%9sx${NC}\n" \
+            "$lr" "$rows" "$lt" "$pt" "$rt"
+    elif $RUN_LAMBDA; then
+        printf "  %-9s  %-12s  %13ss\n" "$lr" "$rows" "$lt"
+    else
+        printf "  %-9s  %-12s  %13ss\n" "$lr" "$rows" "$pt"
+    fi
+done
+
+echo ""
+if $RUN_LAMBDA && $RUN_P3; then
+    echo -e "Timing window: single-shot end-to-end prove. Ratio < 1 → Lambda faster."
+fi
+if $NO_P3_PATCH; then
+    echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2."
+    echo -e "      Lambda keeps degree-3 — extension fields differ across sides."
+fi
+
+# --- Machine-readable report ------------------------------------------------
+
+if [ -n "$REPORT_DIR" ]; then
+    {
+        printf "log_rows\trows\tlambda_median_s\tp3_median_s\tratio_lambda_over_p3\truns\n"
+        for i in "${!RESULT_LOG_ROWS[@]}"; do
+            printf "%s\t%s\t%s\t%s\t%s\t%s\n" \
+                "${RESULT_LOG_ROWS[$i]}" \
+                "${RESULT_ROWS[$i]}" \
+                "${RESULT_LAMBDA[$i]}" \
+                "${RESULT_P3[$i]}" \
+                "${RESULT_RATIO[$i]}" \
+                "$RUNS"
+        done
+    } > "$REPORT_DIR/results.tsv"
+
+    {
+        echo "# Lambda STARK vs Plonky3 Benchmark"
+        echo
+        echo "Timing window: \`single-shot end-to-end prove\` (no verification)."
+        echo "num-sequences: \`$NUM_SEQUENCES\`, columns: \`$((2 * NUM_SEQUENCES))\`, blowup: 2, fri_queries: 219, grinding: 0."
+        echo "runs per size: \`$RUNS\` (median reported)."
+        echo "arch: \`$(uname -m)\`, scalar mode: \`$($SCALAR && echo on || echo off)\`."
+        if $SCALAR && [ -n "$SCALAR_RUSTFLAGS" ]; then
+            echo "RUSTFLAGS: \`$SCALAR_RUSTFLAGS\`."
+        fi
+        if $NO_P3_PATCH; then
+            echo
+            echo "> Plonky3 built without the vendored degree-3 patch: Challenge type is degree-2 (vanilla crates.io p3-goldilocks 0.5.2). Lambda still uses degree 3."
+        fi
+        echo
+        echo "| log-rows | rows | Lambda (s) | P3 (s) | Lambda / P3 |"
+        echo "|---------:|-----:|-----------:|-------:|------------:|"
+        for i in "${!RESULT_LOG_ROWS[@]}"; do
+            printf "| %s | %s | %s | %s | %s |\n" \
+                "${RESULT_LOG_ROWS[$i]}" \
+                "${RESULT_ROWS[$i]}" \
+                "${RESULT_LAMBDA[$i]}" \
+                "${RESULT_P3[$i]}" \
+                "${RESULT_RATIO[$i]}"
+        done
+    } > "$REPORT_DIR/summary.md"
+fi
diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs
new file mode 100644
index 000000000..cb58aea42
--- /dev/null
+++ b/bench_vs_plonky3/src/bin/prove_bench.rs
@@ -0,0 +1,185 @@
+//! Minimal wall-clock benchmark harness for Lambda STARK vs Plonky3.
+//!
+//! Builds the same Fibonacci AIR as `instruments_breakdown` (but without any
+//! instrumentation) and prints a single line `Proving time: X.XXXs` to
+//! stdout, suitable for parsing by `bench_vs_plonky3/run.sh`.
+//!
+//! Usage:
+//!   prove_bench --prover {lambda|p3} [--log-rows K] [--num-sequences N]
+//!               [--blowup B] [--queries Q] [--grinding G]
+//!
+//! Defaults match production (`GoldilocksCubicProofOptions::with_blowup(2)`):
+//!   log-rows=19, num-sequences=16, blowup=2, queries=219, grinding=0.
+
+use std::process::ExitCode;
+use std::time::Instant;
+
+use bench_vs_plonky3::{lambda_fibonacci_pair, plonky3_config, plonky3_fibonacci};
+use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+use math::field::element::FieldElement;
+use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
+use math::field::goldilocks::GoldilocksField;
+use stark::proof::options::ProofOptions;
+use stark::prover::{IsStarkProver, Prover};
+
+type F = GoldilocksField;
+type E = Degree3GoldilocksExtensionField;
+type FE = FieldElement<F>;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum ProverKind {
+    Lambda,
+    P3,
+}
+
+struct Args {
+    prover: ProverKind,
+    log_rows: u32,
+    num_sequences: usize,
+    blowup: u8,
+    queries: usize,
+    grinding: u8,
+}
+
+impl Default for Args {
+    fn default() -> Self {
+        Self {
+            prover: ProverKind::Lambda,
+            log_rows: 19,
+            num_sequences: 16,
+            blowup: 2,
+            queries: 219,
+            grinding: 0,
+        }
+    }
+}
+
+fn print_usage() {
+    eprintln!(
+        "usage: prove_bench --prover {{lambda|p3}} \
+         [--log-rows K] [--num-sequences N] \
+         [--blowup B] [--queries Q] [--grinding G]"
+    );
+}
+
+fn parse_args() -> Result<Args, String> {
+    let mut args = Args::default();
+    let mut prover_set = false;
+    let mut iter = std::env::args().skip(1);
+    while let Some(a) = iter.next() {
+        match a.as_str() {
+            "--prover" => {
+                let v = iter.next().ok_or("--prover needs a value")?;
+                args.prover = match v.as_str() {
+                    "lambda" => ProverKind::Lambda,
+                    "p3" => ProverKind::P3,
+                    other => return Err(format!("unknown prover: {other}")),
+                };
+                prover_set = true;
+            }
+            "--log-rows" => {
+                let v = iter.next().ok_or("--log-rows needs a value")?;
+                args.log_rows = v.parse().map_err(|_| "--log-rows: invalid u32")?;
+            }
+            "--num-sequences" => {
+                let v = iter.next().ok_or("--num-sequences needs a value")?;
+                args.num_sequences = v.parse().map_err(|_| "--num-sequences: invalid usize")?;
+            }
+            "--blowup" => {
+                let v = iter.next().ok_or("--blowup needs a value")?;
+                args.blowup = v.parse().map_err(|_| "--blowup: invalid u8")?;
+            }
+            "--queries" => {
+                let v = iter.next().ok_or("--queries needs a value")?;
+                args.queries = v.parse().map_err(|_| "--queries: invalid usize")?;
+            }
+            "--grinding" => {
+                let v = iter.next().ok_or("--grinding needs a value")?;
+                args.grinding = v.parse().map_err(|_| "--grinding: invalid u8")?;
+            }
+            "-h" | "--help" => {
+                print_usage();
+                std::process::exit(0);
+            }
+            other => return Err(format!("unknown arg: {other}")),
+        }
+    }
+    if !prover_set {
+        return Err("--prover is required".into());
+    }
+    if args.log_rows < 2 || args.log_rows > 30 {
+        return Err("--log-rows must be in [2, 30]".into());
+    }
+    if args.num_sequences == 0 {
+        return Err("--num-sequences must be > 0".into());
+    }
+    Ok(args)
+}
+
+fn proof_options(args: &Args) -> ProofOptions {
+    ProofOptions {
+        blowup_factor: args.blowup,
+        fri_number_of_queries: args.queries,
+        coset_offset: 3,
+        grinding_factor: args.grinding,
+    }
+}
+
+fn run_lambda(args: &Args) -> std::time::Duration {
+    let rows = 1usize << args.log_rows;
+    let options = proof_options(args);
+
+    let initial_values: Vec<(FE, FE)> = (0..args.num_sequences)
+        .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
+        .collect();
+
+    let mut trace = lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
+    let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values);
+    let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
+        &options,
+        args.num_sequences,
+    );
+
+    let start = Instant::now();
+    let _proof = Prover::<F, E, _>::prove(
+        &air,
+        &mut trace,
+        &pub_inputs,
+        &mut DefaultTranscript::<E>::new(&[]),
+    )
+    .expect("lambda prove failed");
+    start.elapsed()
+}
+
+fn run_p3(args: &Args) -> std::time::Duration {
+    let rows = 1usize << args.log_rows;
+    let config = plonky3_config::matched_params_config(); // args.{blowup,queries,grinding} ignored
+    let air = plonky3_fibonacci::P3FibonacciAir {
+        num_sequences: args.num_sequences,
+    };
+    let trace = plonky3_fibonacci::generate_fibonacci_trace(args.num_sequences, rows);
+    let pis = plonky3_fibonacci::public_values(args.num_sequences);
+
+    let start = Instant::now(); // setup above is untimed; only the prove call below is measured
+    let _proof = p3_uni_stark::prove(&config, &air, trace, &pis);
+    start.elapsed()
+}
+
+fn main() -> ExitCode {
+    let args = match parse_args() {
+        Ok(a) => a,
+        Err(e) => {
+            eprintln!("error: {e}");
+            print_usage();
+            return ExitCode::from(2);
+        }
+    };
+
+    let elapsed = match args.prover {
+        ProverKind::Lambda => run_lambda(&args),
+        ProverKind::P3 => run_p3(&args),
+    };
+
+    println!("Proving time: {:.3}s", elapsed.as_secs_f64());
+    ExitCode::SUCCESS
+}
diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
new file mode 100644
index 000000000..2f1fd4990
--- /dev/null
+++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
@@ -0,0 +1,326 @@
+//! Lambda AIR matching Plonky3's `P3FibonacciAir` exactly in shape.
+//!
+//! Each sequence uses 2 columns (`left`, `right`) with a 2-row transition
+//! window, packing 2 Fibonacci steps per row:
+//!
+//!   `local.left  = x_{2i}`
+//!   `local.right = x_{2i+1}`
+//!   `next.left   = x_{2i+2} = local.left + local.right`
+//!   `next.right  = x_{2i+3} = local.right + next.left`
+//!
+//! For `num_sequences` sequences:
+//!   - columns = `2 * num_sequences`
+//!   - transition constraints = `2 * num_sequences`
+//!   - boundary constraints = `2 * num_sequences` (pin `(a, b)` at row 0)
+//!
+//! This matches `P3FibonacciAir` cell-by-cell; only the prover internals
+//! (multi_prove vs uni-stark, degree-3 vs degree-2 extension) differ.
+
+use std::marker::PhantomData;
+
+use math::field::{
+    element::FieldElement,
+    traits::{IsFFTField, IsField, IsSubFieldOf},
+};
+use stark::{
+    constraints::{
+        boundary::{BoundaryConstraint, BoundaryConstraints},
+        transition::TransitionConstraint,
+    },
+    context::AirContext,
+    proof::options::ProofOptions,
+    trace::TraceTable,
+    traits::{AIR, TransitionEvaluationContext},
+};
+
+/// `next.left = local.left + local.right`  (advances 2 Fibonacci steps)
+#[derive(Clone)]
+pub struct FibPairShiftConstraint<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    seq_idx: usize,
+    constraint_idx: usize,
+    phantom_f: PhantomData<F>,
+    phantom_e: PhantomData<E>,
+}
+
+impl<F, E> FibPairShiftConstraint<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    pub fn new(seq_idx: usize, constraint_idx: usize) -> Self {
+        Self {
+            seq_idx,
+            constraint_idx,
+            phantom_f: PhantomData,
+            phantom_e: PhantomData,
+        }
+    }
+}
+
+impl<F, E> TransitionConstraint<F, E> for FibPairShiftConstraint<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    fn degree(&self) -> usize {
+        1
+    }
+
+    fn constraint_idx(&self) -> usize {
+        self.constraint_idx
+    }
+
+    fn end_exemptions(&self) -> usize {
+        1
+    }
+
+    fn evaluate(
+        &self,
+        eval_ctx: &TransitionEvaluationContext<F, E>,
+        out: &mut [FieldElement<E>],
+    ) {
+        match eval_ctx {
+            TransitionEvaluationContext::Prover { frame, .. } => {
+                let s0 = frame.get_evaluation_step(0);
+                let s1 = frame.get_evaluation_step(1);
+                let local_left = s0.get_main_evaluation_element(0, 2 * self.seq_idx);
+                let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+                let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx);
+                let res = next_left - local_left - local_right;
+                out[self.constraint_idx] = res.to_extension();
+            }
+            TransitionEvaluationContext::Verifier { frame, .. } => {
+                let s0 = frame.get_evaluation_step(0);
+                let s1 = frame.get_evaluation_step(1);
+                let local_left = s0.get_main_evaluation_element(0, 2 * self.seq_idx);
+                let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+                let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx);
+                let res = next_left - local_left - local_right;
+                out[self.constraint_idx] = res;
+            }
+        }
+    }
+}
+
+/// `next.right = local.right + next.left`
+#[derive(Clone)]
+pub struct FibPairSumConstraint<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    seq_idx: usize,
+    constraint_idx: usize,
+    phantom_f: PhantomData<F>,
+    phantom_e: PhantomData<E>,
+}
+
+impl<F, E> FibPairSumConstraint<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    pub fn new(seq_idx: usize, constraint_idx: usize) -> Self {
+        Self {
+            seq_idx,
+            constraint_idx,
+            phantom_f: PhantomData,
+            phantom_e: PhantomData,
+        }
+    }
+}
+
+impl<F, E> TransitionConstraint<F, E> for FibPairSumConstraint<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    fn degree(&self) -> usize {
+        1
+    }
+
+    fn constraint_idx(&self) -> usize {
+        self.constraint_idx
+    }
+
+    fn end_exemptions(&self) -> usize {
+        1
+    }
+
+    fn evaluate(
+        &self,
+        eval_ctx: &TransitionEvaluationContext<F, E>,
+        out: &mut [FieldElement<E>],
+    ) {
+        match eval_ctx {
+            TransitionEvaluationContext::Prover { frame, .. } => {
+                let s0 = frame.get_evaluation_step(0);
+                let s1 = frame.get_evaluation_step(1);
+                let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+                let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx);
+                let next_right = s1.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+                let res = next_right - local_right - next_left;
+                out[self.constraint_idx] = res.to_extension();
+            }
+            TransitionEvaluationContext::Verifier { frame, .. } => {
+                let s0 = frame.get_evaluation_step(0);
+                let s1 = frame.get_evaluation_step(1);
+                let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+                let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx);
+                let next_right = s1.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+                let res = next_right - local_right - next_left;
+                out[self.constraint_idx] = res;
+            }
+        }
+    }
+}
+
+/// Public inputs: initial `(a, b) = (left, right)` pair for each sequence.
+#[derive(Clone, Debug)]
+pub struct FibonacciPairPublicInputs<F: IsFFTField> {
+    pub initial_values: Vec<(FieldElement<F>, FieldElement<F>)>,
+}
+
+/// Multi-sequence Fibonacci AIR with 2-row window, matching Plonky3's `P3FibonacciAir`.
+pub struct FibonacciPairMultiColAIR<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    context: AirContext,
+    constraints: Vec<Box<dyn TransitionConstraint<F, E>>>,
+    num_sequences: usize,
+}
+
+impl<F, E> AIR for FibonacciPairMultiColAIR<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync + 'static,
+    E: IsField + Send + Sync + 'static,
+{
+    type Field = F;
+    type FieldExtension = E;
+    type PublicInputs = FibonacciPairPublicInputs<Self::Field>;
+
+    fn step_size(&self) -> usize {
+        1
+    }
+
+    fn new(proof_options: &ProofOptions) -> Self {
+        Self::with_num_sequences(proof_options, 2)
+    }
+
+    fn composition_poly_degree_bound(&self, trace_length: usize) -> usize {
+        trace_length
+    }
+
+    fn transition_constraints(&self) -> &Vec<Box<dyn TransitionConstraint<F, E>>> {
+        &self.constraints
+    }
+
+    fn boundary_constraints(
+        &self,
+        pub_inputs: &Self::PublicInputs,
+        _rap_challenges: &[FieldElement<Self::FieldExtension>],
+        _bus_public_inputs: Option<&stark::lookup::BusPublicInputs<Self::FieldExtension>>,
+        _trace_length: usize,
+    ) -> BoundaryConstraints<Self::FieldExtension> {
+        let mut constraints = Vec::with_capacity(2 * pub_inputs.initial_values.len());
+        for (seq_idx, (a, b)) in pub_inputs.initial_values.iter().enumerate() {
+            constraints.push(BoundaryConstraint::new_main(
+                2 * seq_idx,
+                0,
+                a.clone().to_extension(),
+            ));
+            constraints.push(BoundaryConstraint::new_main(
+                2 * seq_idx + 1,
+                0,
+                b.clone().to_extension(),
+            ));
+        }
+        BoundaryConstraints::from_constraints(constraints)
+    }
+
+    fn context(&self) -> &AirContext {
+        &self.context
+    }
+
+    fn trace_layout(&self) -> (usize, usize) {
+        (2 * self.num_sequences, 0)
+    }
+}
+
+impl<F, E> FibonacciPairMultiColAIR<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync + 'static,
+    E: IsField + Send + Sync + 'static,
+{
+    pub fn with_num_sequences(proof_options: &ProofOptions, num_sequences: usize) -> Self {
+        let mut constraints: Vec<Box<dyn TransitionConstraint<F, E>>> =
+            Vec::with_capacity(2 * num_sequences);
+        for seq in 0..num_sequences {
+            constraints.push(Box::new(FibPairShiftConstraint::new(seq, 2 * seq)));
+            constraints.push(Box::new(FibPairSumConstraint::new(seq, 2 * seq + 1)));
+        }
+
+        let context = AirContext {
+            proof_options: proof_options.clone(),
+            trace_columns: 2 * num_sequences,
+            transition_offsets: vec![0, 1],
+            num_transition_constraints: 2 * num_sequences,
+        };
+
+        Self {
+            context,
+            constraints,
+            num_sequences,
+        }
+    }
+}
+
+/// Computes the packed Fibonacci trace.
+///
+/// Each row holds `(x_{2i}, x_{2i+1})` for each sequence. Identical values to
+/// `plonky3_fibonacci::generate_fibonacci_trace` at the same coordinates.
+pub fn compute_trace<F, E>(
+    initial_values: &[(FieldElement<F>, FieldElement<F>)],
+    trace_length: usize,
+) -> TraceTable<F, E>
+where
+    F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
+    E: IsField + Send + Sync,
+{
+    let num_sequences = initial_values.len();
+    let mut columns: Vec<Vec<FieldElement<F>>> = Vec::with_capacity(2 * num_sequences);
+
+    for (a, b) in initial_values {
+        let mut left_col = Vec::with_capacity(trace_length);
+        let mut right_col = Vec::with_capacity(trace_length);
+
+        let mut left = a.clone();
+        let mut right = b.clone();
+
+        for _ in 0..trace_length {
+            left_col.push(left.clone());
+            right_col.push(right.clone());
+            let new_left = left.clone() + right.clone();
+            let new_right = right.clone() + new_left.clone();
+            left = new_left;
+            right = new_right;
+        }
+
+        columns.push(left_col);
+        columns.push(right_col);
+    }
+
+    TraceTable::from_columns_main(columns, 1)
+}
+
+pub fn create_public_inputs<F: IsFFTField>(
+    initial_values: Vec<(FieldElement<F>, FieldElement<F>)>,
+) -> FibonacciPairPublicInputs<F> {
+    FibonacciPairPublicInputs { initial_values }
+}
diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs
new file mode 100644
index 000000000..224ad5fa9
--- /dev/null
+++ b/bench_vs_plonky3/src/lib.rs
@@ -0,0 +1,341 @@
+pub mod lambda_fibonacci_pair;
+pub mod plonky3_config;
+pub mod plonky3_fibonacci;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crypto::fiat_shamir::default_transcript::DefaultTranscript;
+    use math::field::element::FieldElement;
+    use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
+    use math::field::goldilocks::GoldilocksField;
+    use p3_field::PrimeField64;
+    use p3_uni_stark::{prove, verify};
+    use stark::proof::options::ProofOptions;
+    use stark::prover::{IsStarkProver, Prover};
+    use stark::verifier::{IsStarkVerifier, Verifier};
+
+    type F = GoldilocksField;
+    type E = Degree3GoldilocksExtensionField;
+    type FE = FieldElement<F>;
+
+    fn benchmark_proof_options() -> ProofOptions {
+        ProofOptions {
+            blowup_factor: 2,
+            fri_number_of_queries: 219,
+            coset_offset: 3,
+            grinding_factor: 0,
+        }
+    }
+
+    #[test]
+    fn lambda_fibonacci_pair_prove_verify() {
+        let num_sequences = 2;
+        let trace_length = 128; // 2^7
+        let proof_options = benchmark_proof_options();
+
+        let initial_values: Vec<(FE, FE)> = (0..num_sequences)
+            .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
+            .collect();
+
+        let mut trace =
+            lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, trace_length);
+        let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values);
+        let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
+            &proof_options,
+            num_sequences,
+        );
+
+        let proof = Prover::<F, E, _>::prove(
+            &air,
+            &mut trace,
+            &pub_inputs,
+            &mut DefaultTranscript::<E>::new(&[]),
+        )
+        .unwrap();
+
+        assert!(Verifier::<F, E, _>::verify(
+            &proof,
+            &air,
+            &mut DefaultTranscript::<E>::new(&[]),
+        ));
+    }
+
+    #[test]
+    fn plonky3_fibonacci_prove_verify() {
+        let num_sequences = 2;
+        let rows = 128; // 2^7
+
+        let config = plonky3_config::matched_params_config();
+        let air = plonky3_fibonacci::P3FibonacciAir { num_sequences };
+        let trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows);
+        let pis = plonky3_fibonacci::public_values(num_sequences);
+
+        let proof = prove(&config, &air, trace, &pis);
+        verify(&config, &air, &proof, &pis).expect("Plonky3 verification failed");
+    }
+
+    /// Prints Lambda prover timings (`instruments` feature) plus a Plonky3 span-based breakdown.
+    /// Run: cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture
+    #[test]
+    fn instruments_breakdown() {
+        let num_sequences = 16;
+        let rows = 1 << 18;
+        let proof_options = benchmark_proof_options();
+
+        let initial_values: Vec<(FE, FE)> = (0..num_sequences)
+            .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
+            .collect();
+
+        let mut trace =
+            lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
+        let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values);
+        let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
+            &proof_options,
+            num_sequences,
+        );
+
+        let start = std::time::Instant::now();
+        let _proof = Prover::<F, E, _>::prove(
+            &air,
+            &mut trace,
+            &pub_inputs,
+            &mut DefaultTranscript::<E>::new(&[]),
+        )
+        .unwrap();
+        let total = start.elapsed();
+
+        println!("\n============================================================");
+        println!(
+            "Lambda STARK Instruments (blowup={}, queries={})",
+            proof_options.blowup_factor, proof_options.fri_number_of_queries
+        );
+        println!("Trace: {} rows x {} cols", rows, 2 * num_sequences);
+        println!("Total prove: {:.3}s", total.as_secs_f64());
+
+        #[cfg(feature = "instruments")]
+        if let Some(timing) = stark::instruments::take() {
+            println!("\n--- High-level phases ---");
+            println!(
+                "  Pre-pass:            {:>8.1}ms",
+                timing.prepass.as_secs_f64() * 1000.0
+            );
+            println!(
+                "  R1 Main commits:     {:>8.1}ms",
+                timing.main_commits.as_secs_f64() * 1000.0
+            );
+            println!(
+                "  R1 Aux build:        {:>8.1}ms",
+                timing.aux_build.as_secs_f64() * 1000.0
+            );
+            println!(
+                "  R1 Aux commit:       {:>8.1}ms",
+                timing.aux_commit.as_secs_f64() * 1000.0
+            );
+            println!(
+                "  Rounds 2-4:          {:>8.1}ms",
+                timing.rounds_2_4.as_secs_f64() * 1000.0
+            );
+
+            let r1 = &timing.round1_sub;
+            println!("\n--- Round 1 sub-ops ---");
+            println!(
+                "  Main LDE (FFT):      {:>8.1}ms",
+                r1.main_lde.as_secs_f64() * 1000.0
+            );
+            println!(
+                "  Main Merkle:         {:>8.1}ms",
+                r1.main_merkle.as_secs_f64() * 1000.0
+            );
+
+            for (name, tbl_rows, dur, sub) in &timing.table_timings {
+                println!(
+                    "\n--- Rounds 2-4: {} ({} rows, {:.1}ms) ---",
+                    name,
+                    tbl_rows,
+                    dur.as_secs_f64() * 1000.0
+                );
+                println!(
+                    "  R2 constraint eval:{:>8.1}ms  ({:.0}%)",
+                    sub.constraints.as_secs_f64() * 1000.0,
+                    sub.constraints.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R2 decompose+ext:  {:>8.1}ms  ({:.0}%)",
+                    sub.comp_decompose.as_secs_f64() * 1000.0,
+                    sub.comp_decompose.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R2 comp Merkle:    {:>8.1}ms  ({:.0}%)",
+                    sub.comp_commit.as_secs_f64() * 1000.0,
+                    sub.comp_commit.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R3 OOD eval:       {:>8.1}ms  ({:.0}%)",
+                    sub.ood.as_secs_f64() * 1000.0,
+                    sub.ood.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R4 deep comp:      {:>8.1}ms  ({:.0}%)",
+                    sub.deep_comp.as_secs_f64() * 1000.0,
+                    sub.deep_comp.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R4 deep extend:    {:>8.1}ms  ({:.0}%)",
+                    sub.deep_extend.as_secs_f64() * 1000.0,
+                    sub.deep_extend.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R4 FRI commit:     {:>8.1}ms  ({:.0}%)",
+                    sub.fri_commit.as_secs_f64() * 1000.0,
+                    sub.fri_commit.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+                println!(
+                    "  R4 queries+open:   {:>8.1}ms  ({:.0}%)",
+                    sub.queries.as_secs_f64() * 1000.0,
+                    sub.queries.as_secs_f64() / total.as_secs_f64() * 100.0
+                );
+            }
+        }
+
+        #[cfg(not(feature = "instruments"))]
+        println!("(rebuild with --features instruments for breakdown)");
+
+        // --- Plonky3 breakdown via tracing spans ---
+        // Captures spans up to DEBUG verbosity (no TRACE): quotient_values, FRI commit, etc.
+        println!("\n============================================================");
+        println!("Plonky3 STARK Span Breakdown");
+
+        use std::collections::HashMap;
+        use std::sync::{Arc, Mutex};
+        use tracing_subscriber::layer::SubscriberExt;
+
+        type SpanResults = Arc<Mutex<Vec<(String, f64)>>>;
+
+        struct P3TimingLayer {
+            spans: Mutex<HashMap<u64, (String, Option<std::time::Instant>)>>,
+            results: SpanResults,
+        }
+
+        impl<S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>>
+            tracing_subscriber::Layer<S> for P3TimingLayer
+        {
+            fn on_new_span(
+                &self,
+                attrs: &tracing::span::Attributes<'_>,
+                id: &tracing::span::Id,
+                _ctx: tracing_subscriber::layer::Context<'_, S>,
+            ) {
+                let name = attrs.metadata().name().to_string();
+                self.spans
+                    .lock()
+                    .unwrap()
+                    .insert(id.into_u64(), (name, None));
+            }
+
+            fn on_enter(
+                &self,
+                id: &tracing::span::Id,
+                _ctx: tracing_subscriber::layer::Context<'_, S>,
+            ) {
+                if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) {
+                    entry.1 = Some(std::time::Instant::now());
+                }
+            }
+
+            fn on_close(
+                &self,
+                id: tracing::span::Id,
+                _ctx: tracing_subscriber::layer::Context<'_, S>,
+            ) {
+                if let Some((name, Some(start))) =
+                    self.spans.lock().unwrap().remove(&id.into_u64())
+                {
+                    let ms = start.elapsed().as_secs_f64() * 1000.0;
+                    self.results.lock().unwrap().push((name, ms));
+                }
+            }
+        }
+
+        let results: SpanResults = Arc::new(Mutex::new(Vec::new()));
+        let layer = P3TimingLayer {
+            spans: Mutex::new(HashMap::new()),
+            results: Arc::clone(&results),
+        };
+        let filter = tracing_subscriber::filter::LevelFilter::DEBUG;
+        let subscriber = tracing_subscriber::registry().with(filter).with(layer);
+
+        let config = plonky3_config::matched_params_config();
+        let p3_air = plonky3_fibonacci::P3FibonacciAir { num_sequences };
+        let p3_trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows);
+        let p3_pis = plonky3_fibonacci::public_values(num_sequences);
+
+        let p3_prove_dur;
+        {
+            let _guard = tracing::subscriber::set_default(subscriber);
+            let p3_start = std::time::Instant::now();
+            let _p3_proof = p3_uni_stark::prove(&config, &p3_air, p3_trace, &p3_pis);
+            p3_prove_dur = p3_start.elapsed();
+        }
+
+        let total_ms = p3_prove_dur.as_secs_f64() * 1000.0;
+        println!("  Prove total:  {:.1}ms\n", total_ms);
+
+        // Sort spans by duration descending and print
+        let mut span_data = results.lock().unwrap().clone();
+        span_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+        for (name, ms) in &span_data {
+            if *ms >= 0.1 {
+                println!("  {:.<40} {:>8.1}ms  ({:.0}%)", name, ms, ms / total_ms * 100.0);
+            }
+        }
+        let accounted: f64 = span_data.iter().map(|(_, ms)| ms).sum();
+        let unaccounted = total_ms - accounted;
+        if unaccounted > 1.0 {
+            println!(
+                "  {:.<40} {:>8.1}ms  ({:.0}%)",
+                "(unaccounted)",
+                unaccounted,
+                unaccounted / total_ms * 100.0
+            );
+        }
+        println!("============================================================\n");
+    }
+
+    /// Verifies that the new Lambda pair AIR trace and the Plonky3 trace are
+    /// cell-by-cell identical at the same (row, col) coordinates.
+    #[test]
+    fn lambda_pair_trace_matches_plonky3_trace() {
+        let num_sequences = 3;
+        let rows = 16;
+
+        let initial_values: Vec<(FE, FE)> = (0..num_sequences)
+            .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
+            .collect();
+
+        let lambda_trace =
+            lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
+        let p3_trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows);
+
+        assert_eq!(p3_trace.width, 2 * num_sequences);
+        for row in 0..rows {
+            for seq in 0..num_sequences {
+                let p3_left = p3_trace.values[row * p3_trace.width + 2 * seq].as_canonical_u64();
+                let p3_right =
+                    p3_trace.values[row * p3_trace.width + 2 * seq + 1].as_canonical_u64();
+
+                assert_eq!(
+                    FE::from(p3_left),
+                    lambda_trace.get_main(row, 2 * seq).clone(),
+                    "left mismatch at row {row}, seq {seq}"
+                );
+                assert_eq!(
+                    FE::from(p3_right),
+                    lambda_trace.get_main(row, 2 * seq + 1).clone(),
+                    "right mismatch at row {row}, seq {seq}"
+                );
+            }
+        }
+    }
+}
diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs
new file mode 100644
index 000000000..b74f18ad2
--- /dev/null
+++ b/bench_vs_plonky3/src/plonky3_config.rs
@@ -0,0 +1,92 @@
+use p3_challenger::{HashChallenger, SerializingChallenger64};
+use p3_commit::ExtensionMmcs;
+use p3_dft::Radix2DitParallel;
+use p3_field::extension::BinomialExtensionField;
+use p3_fri::{FriParameters, TwoAdicFriPcs};
+use p3_goldilocks::Goldilocks;
+use p3_keccak::{Keccak256Hash, KeccakF};
+use p3_merkle_tree::MerkleTreeMmcs;
+use p3_symmetric::{CompressionFunctionFromHasher, PaddingFreeSponge, SerializingHasher};
+use p3_uni_stark::StarkConfig;
+
+pub type Val = Goldilocks;
+
+/// Cubic extension (default, `p3-degree3` feature): matches Lambda's
+/// `Degree3GoldilocksExtensionField`, irreducible x^3 - 2. Needs the vendored
+/// `p3-goldilocks-patched` crate (enabled via root `[patch.crates-io]`).
+#[cfg(feature = "p3-degree3")]
+pub type Challenge = BinomialExtensionField<Val, 3>;
+
+/// Quadratic extension (vanilla upstream p3-goldilocks 0.5.2). Compiled when
+/// `p3-degree3` is disabled, typically together with commenting out the root
+/// `[patch.crates-io]` block. Lambda still runs degree 3, so this is NOT a
+/// fair comparison on the extension field — it is used for nightly tracking
+/// against the off-the-shelf P3 config.
+#[cfg(not(feature = "p3-degree3"))]
+pub type Challenge = BinomialExtensionField<Val, 2>;
+
+type ByteHash = Keccak256Hash;
+type U64Hash = PaddingFreeSponge<KeccakF, 25, 17, 4>;
+type FieldHash = SerializingHasher<U64Hash>;
+type MyCompress = CompressionFunctionFromHasher<U64Hash, 2, 4>;
+pub type ValMmcs = MerkleTreeMmcs<
+    [Val; p3_keccak::VECTOR_LEN],
+    [u64; p3_keccak::VECTOR_LEN],
+    FieldHash,
+    MyCompress,
+    2,
+    4,
+>;
+type ChallengeMmcs = ExtensionMmcs<Val, Challenge, ValMmcs>;
+type Dft = Radix2DitParallel<Val>;
+pub type Pcs = TwoAdicFriPcs<Val, Dft, ValMmcs, ChallengeMmcs>;
+pub type Challenger = SerializingChallenger64<Val, HashChallenger<u8, ByteHash, 32>>;
+
+pub type P3Config = StarkConfig<Pcs, Challenge, Challenger>;
+
+fn build_mmcs() -> (ValMmcs, ChallengeMmcs, ByteHash) {
+    let byte_hash = ByteHash {};
+    let u64_hash = U64Hash::new(KeccakF {});
+    let field_hash = FieldHash::new(u64_hash);
+    let compress = MyCompress::new(u64_hash);
+    let val_mmcs = ValMmcs::new(field_hash, compress, 3);
+    let challenge_mmcs = ChallengeMmcs::new(val_mmcs.clone());
+    (val_mmcs, challenge_mmcs, byte_hash)
+}
+
+/// Creates a Plonky3 STARK config with parameters matched to Lambda's
+/// production config `GoldilocksCubicProofOptions::with_blowup(2)`:
+/// blowup=2, 219 FRI queries, grinding=0 (excluded from benchmark).
+pub fn matched_params_config() -> P3Config {
+    let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs();
+    let dft = Dft::default();
+    let challenger = Challenger::from_hasher(vec![], byte_hash);
+
+    // Match Lambda production: blowup=2, queries=219, grinding=0.
+    // Grinding excluded from benchmark (identical PoW on both sides).
+    let fri_params = FriParameters {
+        log_blowup: 1, // blowup = 2
+        log_final_poly_len: 0,
+        max_log_arity: 1,
+        num_queries: 219,
+        commit_proof_of_work_bits: 0,
+        query_proof_of_work_bits: 0,
+        mmcs: challenge_mmcs,
+    };
+
+    let pcs = Pcs::new(dft, val_mmcs, fri_params);
+    P3Config::new(pcs, challenger)
+}
+
+/// Creates a Plonky3 STARK config with Plonky3's standard benchmark parameters:
+/// blowup=2, 100 FRI queries, 16-bit query PoW.
+pub fn plonky3_benchmark_config() -> P3Config {
+    let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs();
+    let dft = Dft::default();
+    let challenger = Challenger::from_hasher(vec![], byte_hash);
+
+    let fri_params = p3_fri::create_benchmark_fri_params(challenge_mmcs);
+
+    let pcs = Pcs::new(dft, val_mmcs, fri_params);
+    P3Config::new(pcs, challenger)
+}
diff --git a/bench_vs_plonky3/src/plonky3_fibonacci.rs b/bench_vs_plonky3/src/plonky3_fibonacci.rs
new file mode 100644
index 000000000..c55bca8c5
--- /dev/null
+++ b/bench_vs_plonky3/src/plonky3_fibonacci.rs
@@ -0,0 +1,144 @@
+use p3_air::{Air, AirBuilder, BaseAir, WindowAccess};
+use p3_field::PrimeCharacteristicRing;
+use p3_goldilocks::Goldilocks;
+use p3_matrix::dense::RowMajorMatrix;
+
+/// Multi-sequence Fibonacci AIR for Plonky3.
+///
+/// Each sequence uses 2 columns (left, right) in a 2-row window, where each
+/// Plonky3 row stores two consecutive Lambda rows:
+///   local.left  = x_{2i}
+///   local.right = x_{2i+1}
+///   next.left   = x_{2i+2} = local.left + local.right
+///   next.right  = x_{2i+3} = local.right + next.left
+///
+/// This packs two consecutive Lambda trace rows into one Plonky3 row. It is the
+/// closest encoding of Lambda's `row + 2` Fibonacci transition available in
+/// Plonky3's current/next-row AIR window while keeping the same committed cell
+/// count.
+///
+/// Boundary constraints at the first row pin each sequence's initial (a, b)
+/// values against public inputs, matching Lambda's `FibonacciMultiColumnAIR`.
+///
+/// Public values layout: `[a_0, b_0, a_1, b_1, ..., a_{N-1}, b_{N-1}]`
+/// where `N = num_sequences`.
+///
+/// For `num_sequences` sequences, the AIR has `2 * num_sequences` columns
+/// and `2 * num_sequences` public values.
+pub struct P3FibonacciAir {
+    pub num_sequences: usize,
+}
+
+impl<F: PrimeCharacteristicRing> BaseAir<F> for P3FibonacciAir {
+    fn width(&self) -> usize {
+        2 * self.num_sequences
+    }
+
+    fn num_public_values(&self) -> usize {
+        2 * self.num_sequences
+    }
+}
+
+impl<AB: AirBuilder> Air<AB> for P3FibonacciAir {
+    fn eval(&self, builder: &mut AB) {
+        let main = builder.main();
+        let local = main.current_slice();
+        let next = main.next_slice();
+
+        // Collect (left, right, next_left, next_right, a, b) per sequence so that
+        // `pis`'s borrow on `builder` can end before we mutate `builder`.
+        let rows: Vec<(
+            AB::Var,
+            AB::Var,
+            AB::Var,
+            AB::Var,
+            AB::PublicVar,
+            AB::PublicVar,
+        )> = {
+            let pis = builder.public_values();
+            (0..self.num_sequences)
+                .map(|seq| {
+                    (
+                        local[2 * seq],
+                        local[2 * seq + 1],
+                        next[2 * seq],
+                        next[2 * seq + 1],
+                        pis[2 * seq],
+                        pis[2 * seq + 1],
+                    )
+                })
+                .collect()
+        };
+        drop(main);
+
+        for (left, right, next_left, next_right, a, b) in rows {
+            // Boundary: first row pins (left, right) = (a, b)
+            let mut when_first_row = builder.when_first_row();
+            when_first_row.assert_eq(left, a);
+            when_first_row.assert_eq(right, b);
+
+            let mut when_transition = builder.when_transition();
+            // Advance two Lambda rows per Plonky3 row.
+            when_transition.assert_eq(next_left, left + right);
+            when_transition.assert_eq(next_right, right + next_left);
+        }
+    }
+}
+
+/// Generates a Fibonacci trace for Plonky3.
+///
+/// For `num_sequences` sequences and `num_rows` rows (must be power of 2),
+/// produces a `RowMajorMatrix` with `2 * num_sequences` columns.
+/// Use `rows_for_lambda_trace(lambda_trace_length)` when comparing against
+/// Lambda's one-column-per-sequence trace.
+///
+/// Each sequence `s` starts with initial values matching Lambda's
+/// `create_initial_values()`: `left = s + 1`, `right = s + 2`.
+pub fn generate_fibonacci_trace(
+    num_sequences: usize,
+    num_rows: usize,
+) -> RowMajorMatrix<Goldilocks> {
+    assert!(num_rows.is_power_of_two(), "num_rows must be a power of 2");
+    let width = 2 * num_sequences;
+    let mut values = vec![Goldilocks::ZERO; width * num_rows];
+
+    for seq in 0..num_sequences {
+        let mut left = Goldilocks::from_u64((seq + 1) as u64);
+        let mut right = Goldilocks::from_u64((seq + 2) as u64);
+
+        for row in 0..num_rows {
+            values[row * width + 2 * seq] = left;
+            values[row * width + 2 * seq + 1] = right;
+            let next_left = left + right;
+            let next_right = right + next_left;
+            left = next_left;
+            right = next_right;
+        }
+    }
+
+    RowMajorMatrix::new(values, width)
+}
+
+/// Returns the number of packed Plonky3 rows for a Lambda trace length.
+pub fn rows_for_lambda_trace(lambda_trace_length: usize) -> usize {
+    assert!(
+        lambda_trace_length >= 2,
+        "lambda_trace_length must contain at least two rows"
+    );
+    assert!(
+        lambda_trace_length.is_power_of_two(),
+        "lambda_trace_length must be a power of 2"
+    );
+    lambda_trace_length / 2
+}
+
+/// Builds public values matching `generate_fibonacci_trace`'s initial values:
+/// `[a_0, b_0, a_1, b_1, ...] = [1, 2, 2, 3, 3, 4, ...]`
+pub fn public_values(num_sequences: usize) -> Vec<Goldilocks> {
+    let mut pis = Vec::with_capacity(2 * num_sequences);
+    for seq in 0..num_sequences {
+        pis.push(Goldilocks::from_u64((seq + 1) as u64));
+        pis.push(Goldilocks::from_u64((seq + 2) as u64));
+    }
+    pis
+}

From 1fd1a58ec2edabc76c8f08aaeeffc3f8340ee524 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 18:10:01 -0300
Subject: [PATCH 16/34] lint

---
 bench_vs_plonky3/benches/stark_comparison.rs  | 36 +++++++------------
 bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 12 ++-----
 bench_vs_plonky3/src/lib.rs                   | 24 +++++++------
 bench_vs_plonky3/src/plonky3_fibonacci.rs     | 21 ++++++-----
 4 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/bench_vs_plonky3/benches/stark_comparison.rs b/bench_vs_plonky3/benches/stark_comparison.rs
index fd90ae7b5..577664892 100644
--- a/bench_vs_plonky3/benches/stark_comparison.rs
+++ b/bench_vs_plonky3/benches/stark_comparison.rs
@@ -47,9 +47,7 @@ fn lambda_initial_values() -> Vec<(FE, FE)> {
 
 fn bench_lambda_prove(c: &mut Criterion) {
     let mut group = c.benchmark_group("lambda_stark/prove");
-    group.throughput(Throughput::Elements(
-        (ROWS * 2 * NUM_SEQUENCES) as u64,
-    ));
+    group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64));
     let proof_options = benchmark_proof_options();
 
     group.bench_with_input(
@@ -59,16 +57,13 @@ fn bench_lambda_prove(c: &mut Criterion) {
             b.iter_with_setup(
                 || {
                     let initial_values = lambda_initial_values();
-                    let trace = lambda_fibonacci_pair::compute_trace::<F, E>(
-                        &initial_values,
-                        rows,
-                    );
-                    let pub_inputs =
-                        lambda_fibonacci_pair::create_public_inputs(initial_values);
-                    let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
-                        &proof_options,
-                        NUM_SEQUENCES,
-                    );
+                    let trace = lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
+                    let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values);
+                    let air =
+                        lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
+                            &proof_options,
+                            NUM_SEQUENCES,
+                        );
                     (trace, pub_inputs, air)
                 },
                 |(mut trace, pub_inputs, air)| {
@@ -88,9 +83,7 @@ fn bench_lambda_prove(c: &mut Criterion) {
 
 fn bench_plonky3_prove(c: &mut Criterion) {
     let mut group = c.benchmark_group("plonky3_stark/prove");
-    group.throughput(Throughput::Elements(
-        (ROWS * 2 * NUM_SEQUENCES) as u64,
-    ));
+    group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64));
 
     group.bench_with_input(
         BenchmarkId::new("fibonacci", TRACE_LABEL),
@@ -102,8 +95,7 @@ fn bench_plonky3_prove(c: &mut Criterion) {
                     let air = plonky3_fibonacci::P3FibonacciAir {
                         num_sequences: NUM_SEQUENCES,
                     };
-                    let trace =
-                        plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, rows);
+                    let trace = plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, rows);
                     let pis = plonky3_fibonacci::public_values(NUM_SEQUENCES);
                     (config, air, trace, pis)
                 },
@@ -116,9 +108,7 @@ fn bench_plonky3_prove(c: &mut Criterion) {
 
 fn bench_lambda_verify(c: &mut Criterion) {
     let mut group = c.benchmark_group("lambda_stark/verify");
-    group.throughput(Throughput::Elements(
-        (ROWS * 2 * NUM_SEQUENCES) as u64,
-    ));
+    group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64));
     let proof_options = benchmark_proof_options();
 
     let initial_values = lambda_initial_values();
@@ -150,9 +140,7 @@ fn bench_lambda_verify(c: &mut Criterion) {
 
 fn bench_plonky3_verify(c: &mut Criterion) {
     let mut group = c.benchmark_group("plonky3_stark/verify");
-    group.throughput(Throughput::Elements(
-        (ROWS * 2 * NUM_SEQUENCES) as u64,
-    ));
+    group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64));
 
     let air = plonky3_fibonacci::P3FibonacciAir {
         num_sequences: NUM_SEQUENCES,
diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
index 2f1fd4990..54c704976 100644
--- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
+++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
@@ -78,11 +78,7 @@ where
         1
     }
 
-    fn evaluate(
-        &self,
-        eval_ctx: &TransitionEvaluationContext<F, E>,
-        out: &mut [FieldElement<E>],
-    ) {
+    fn evaluate(&self, eval_ctx: &TransitionEvaluationContext<F, E>, out: &mut [FieldElement<E>]) {
         match eval_ctx {
             TransitionEvaluationContext::Prover { frame, .. } => {
                 let s0 = frame.get_evaluation_step(0);
@@ -151,11 +147,7 @@ where
         1
     }
 
-    fn evaluate(
-        &self,
-        eval_ctx: &TransitionEvaluationContext<F, E>,
-        out: &mut [FieldElement<E>],
-    ) {
+    fn evaluate(&self, eval_ctx: &TransitionEvaluationContext<F, E>, out: &mut [FieldElement<E>]) {
         match eval_ctx {
             TransitionEvaluationContext::Prover { frame, .. } => {
                 let s0 = frame.get_evaluation_step(0);
diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs
index 224ad5fa9..31e9ef470 100644
--- a/bench_vs_plonky3/src/lib.rs
+++ b/bench_vs_plonky3/src/lib.rs
@@ -39,8 +39,7 @@ mod tests {
             .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
             .collect();
 
-        let mut trace =
-            lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, trace_length);
+        let mut trace = lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, trace_length);
         let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values);
         let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
             &proof_options,
@@ -88,8 +87,7 @@ mod tests {
             .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
             .collect();
 
-        let mut trace =
-            lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
+        let mut trace = lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
         let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values);
         let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::<F, E>::with_num_sequences(
             &proof_options,
@@ -218,8 +216,9 @@ mod tests {
             results: SpanResults,
         }
 
-        impl<S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>>
-            tracing_subscriber::Layer<S> for P3TimingLayer
+        impl<
+            S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>,
+        > tracing_subscriber::Layer<S> for P3TimingLayer
         {
             fn on_new_span(
                 &self,
@@ -249,8 +248,7 @@ mod tests {
                 id: tracing::span::Id,
                 _ctx: tracing_subscriber::layer::Context<'_, S>,
             ) {
-                if let Some((name, Some(start))) =
-                    self.spans.lock().unwrap().remove(&id.into_u64())
+                if let Some((name, Some(start))) = self.spans.lock().unwrap().remove(&id.into_u64())
                 {
                     let ms = start.elapsed().as_secs_f64() * 1000.0;
                     self.results.lock().unwrap().push((name, ms));
@@ -287,7 +285,12 @@ mod tests {
         span_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
         for (name, ms) in &span_data {
             if *ms >= 0.1 {
-                println!("  {:.<40} {:>8.1}ms  ({:.0}%)", name, ms, ms / total_ms * 100.0);
+                println!(
+                    "  {:.<40} {:>8.1}ms  ({:.0}%)",
+                    name,
+                    ms,
+                    ms / total_ms * 100.0
+                );
             }
         }
         let accounted: f64 = span_data.iter().map(|(_, ms)| ms).sum();
@@ -314,8 +317,7 @@ mod tests {
             .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64)))
             .collect();
 
-        let lambda_trace =
-            lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
+        let lambda_trace = lambda_fibonacci_pair::compute_trace::<F, E>(&initial_values, rows);
         let p3_trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows);
 
         assert_eq!(p3_trace.width, 2 * num_sequences);
diff --git a/bench_vs_plonky3/src/plonky3_fibonacci.rs b/bench_vs_plonky3/src/plonky3_fibonacci.rs
index c55bca8c5..b1f0816eb 100644
--- a/bench_vs_plonky3/src/plonky3_fibonacci.rs
+++ b/bench_vs_plonky3/src/plonky3_fibonacci.rs
@@ -39,6 +39,18 @@ impl<F: PrimeCharacteristicRing> BaseAir<F> for P3FibonacciAir {
     }
 }
 
+/// One sequence's `(local_left, local_right, next_left, next_right, a, b)`
+/// values taken from an `AirBuilder`: four `Var`s followed by two `PublicVar`s.
+/// Names the element type of `rows` in `Air::eval` (clippy::type_complexity).
+type FibPairRow<AB> = (
+    <AB as AirBuilder>::Var,
+    <AB as AirBuilder>::Var,
+    <AB as AirBuilder>::Var,
+    <AB as AirBuilder>::Var,
+    <AB as AirBuilder>::PublicVar,
+    <AB as AirBuilder>::PublicVar,
+);
+
 impl<AB: AirBuilder> Air<AB> for P3FibonacciAir {
     fn eval(&self, builder: &mut AB) {
         let main = builder.main();
@@ -47,14 +59,7 @@ impl<AB: AirBuilder> Air<AB> for P3FibonacciAir {
 
         // Collect (left, right, next_left, next_right, a, b) per sequence so that
         // `pis`'s borrow on `builder` can end before we mutate `builder`.
-        let rows: Vec<(
-            AB::Var,
-            AB::Var,
-            AB::Var,
-            AB::Var,
-            AB::PublicVar,
-            AB::PublicVar,
-        )> = {
+        let rows: Vec<FibPairRow<AB>> = {
             let pis = builder.public_values();
             (0..self.num_sequences)
                 .map(|seq| {

From c9f9df99fbb5c0baeee00ca1d9fb5bbbb6f397a7 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 19:11:16 -0300
Subject: [PATCH 17/34] Replace summary.md with metrics.txt in bench_vs_plonky3
 and add README

---
 .github/workflows/bench-vs-p3-nightly.yml |  3 --
 bench_vs_plonky3/run.sh                   | 59 +++++++++++++----------
 2 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml
index b8602d7d4..f7856a139 100644
--- a/.github/workflows/bench-vs-p3-nightly.yml
+++ b/.github/workflows/bench-vs-p3-nightly.yml
@@ -46,6 +46,3 @@ jobs:
           name: bench-vs-p3-nightly-${{ github.run_number }}-${{ github.sha }}
           path: bench_vs_p3_artifacts
           retention-days: 90
-
-      - name: Publish summary
-        run: cat bench_vs_p3_artifacts/summary.md >> "$GITHUB_STEP_SUMMARY"
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index 01e3a5306..445d2bf14 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -344,11 +344,13 @@ for i in "${!RESULT_LOG_ROWS[@]}"; do
     rt="${RESULT_RATIO[$i]}"
     if $RUN_LAMBDA && $RUN_P3; then
         color=$GREEN
+        verdict="Lambda faster"
         if awk -v l="$lt" -v p="$pt" 'BEGIN{ exit !(l+0 > p+0) }'; then
             color=$RED
+            verdict="P3 faster"
         fi
-        printf "  %-9s  %-12s  %13ss  %13ss  ${color}%9sx${NC}\n" \
-            "$lr" "$rows" "$lt" "$pt" "$rt"
+        printf "  %-9s  %-12s  %13ss  %13ss  ${color}%9sx${NC}  (${color}%s${NC})\n" \
+            "$lr" "$rows" "$lt" "$pt" "$rt" "$verdict"
     elif $RUN_LAMBDA; then
         printf "  %-9s  %-12s  %13ss\n" "$lr" "$rows" "$lt"
     else
@@ -358,7 +360,8 @@ done
 
 echo ""
 if $RUN_LAMBDA && $RUN_P3; then
-    echo -e "Timing window: single-shot end-to-end prove. Ratio < 1 → Lambda faster."
+    echo -e "Timing window: single-shot end-to-end prove."
+    echo -e "Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster."
 fi
 if $NO_P3_PATCH; then
     echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2."
@@ -368,6 +371,16 @@ fi
 # --- Machine-readable report ------------------------------------------------
 
 if [ -n "$REPORT_DIR" ]; then
+    # join_slash ARG...: print all arguments joined with "/" on a single line.
+    # Used for the per-series metrics.txt values (mirrors bench_vs/run.sh).
+    join_slash() {
+        local joined=""
+        for value in "$@"; do
+            joined="${joined:+$joined/}$value"  # "/" is added only once joined is non-empty
+        done
+        printf "%s\n" "$joined"
+    }
+
     {
         printf "log_rows\trows\tlambda_median_s\tp3_median_s\tratio_lambda_over_p3\truns\n"
         for i in "${!RESULT_LOG_ROWS[@]}"; do
@@ -382,29 +395,23 @@ if [ -n "$REPORT_DIR" ]; then
     } > "$REPORT_DIR/results.tsv"
 
     {
-        echo "# Lambda STARK vs Plonky3 Benchmark"
-        echo
-        echo "Timing window: \`single-shot end-to-end prove\` (no verification)."
-        echo "num-sequences: \`$NUM_SEQUENCES\`, columns: \`$((2 * NUM_SEQUENCES))\`, blowup: 2, fri_queries: 219, grinding: 0."
-        echo "runs per size: \`$RUNS\` (median reported)."
-        echo "arch: \`$(uname -m)\`, scalar mode: \`$($SCALAR && echo on || echo off)\`."
+        echo "arch=$(uname -m)"
+        echo "num_sequences=$NUM_SEQUENCES"
+        echo "columns=$((2 * NUM_SEQUENCES))"
+        echo "blowup=2"
+        echo "fri_queries=219"
+        echo "grinding=0"
+        echo "runs_per_size=$RUNS"
+        echo "p3_extension=$($NO_P3_PATCH && echo 'degree2_vanilla' || echo 'degree3_patched')"
+        echo "scalar=$($SCALAR && echo on || echo off)"
         if $SCALAR && [ -n "$SCALAR_RUSTFLAGS" ]; then
-            echo "RUSTFLAGS: \`$SCALAR_RUSTFLAGS\`."
-        fi
-        if $NO_P3_PATCH; then
-            echo
-            echo "> Plonky3 built without the vendored degree-3 patch: Challenge type is degree-2 (vanilla crates.io p3-goldilocks 0.5.2). Lambda still uses degree 3."
+            echo "rustflags=$SCALAR_RUSTFLAGS"
         fi
-        echo
-        echo "| log-rows | rows | Lambda (s) | P3 (s) | Lambda / P3 |"
-        echo "|---------:|-----:|-----------:|-------:|------------:|"
-        for i in "${!RESULT_LOG_ROWS[@]}"; do
-            printf "| %s | %s | %s | %s | %s |\n" \
-                "${RESULT_LOG_ROWS[$i]}" \
-                "${RESULT_ROWS[$i]}" \
-                "${RESULT_LAMBDA[$i]}" \
-                "${RESULT_P3[$i]}" \
-                "${RESULT_RATIO[$i]}"
-        done
-    } > "$REPORT_DIR/summary.md"
+        echo "timing_window=single_shot_end_to_end_prove_no_verify"
+        echo "log_rows_series=$(join_slash "${RESULT_LOG_ROWS[@]}")"
+        echo "rows_series=$(join_slash "${RESULT_ROWS[@]}")"
+        echo "lambda_medians=$(join_slash "${RESULT_LAMBDA[@]}")"
+        echo "p3_medians=$(join_slash "${RESULT_P3[@]}")"
+        echo "ratios_lambda_over_p3=$(join_slash "${RESULT_RATIO[@]}")"
+    } > "$REPORT_DIR/metrics.txt"
 fi

From ba4c7cd71b20341e53674bc6e977c66d625bde30 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 19:33:44 -0300
Subject: [PATCH 18/34] remove line

---
 bench_vs_plonky3/run.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index 445d2bf14..550a83a77 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -303,13 +303,13 @@ for lr in "${LOG_ROWS[@]}"; do
     if $RUN_LAMBDA; then
         echo -ne "  ${GREEN}[lambda]${NC} "
         lambda_median=$(run_prover lambda "$lr")
-        echo "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")"
+        echo -e "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")"
     fi
 
     if $RUN_P3; then
         echo -ne "  ${GREEN}[p3]${NC}     "
         p3_median=$(run_prover p3 "$lr")
-        echo "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")"
+        echo -e "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")"
     fi
 
     local_ratio="n/a"
@@ -361,7 +361,6 @@ done
 echo ""
 if $RUN_LAMBDA && $RUN_P3; then
     echo -e "Timing window: single-shot end-to-end prove."
-    echo -e "Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster."
 fi
 if $NO_P3_PATCH; then
     echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2."

From f7fe4d3e0750e773049d7ae81ac97ee432a9b3c7 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 19:37:52 -0300
Subject: [PATCH 19/34] add README

---
 bench_vs_plonky3/README.md | 154 +++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 bench_vs_plonky3/README.md

diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md
new file mode 100644
index 000000000..04260876f
--- /dev/null
+++ b/bench_vs_plonky3/README.md
@@ -0,0 +1,154 @@
+# Lambda STARK vs Plonky3 Benchmark
+
+Compares **single-shot end-to-end proving time** for an identical multi-sequence
+Fibonacci AIR. Complements `bench_vs/` (which compares Lambda VM vs SP1 on a
+full guest program) by isolating the STARK prover — no VM execution, no trace
+builder, just one AIR and two provers.
+
+## What is measured
+
+Both provers prove the same AIR:
+
+- **Columns** = `2 × num_sequences` (default 16 sequences → 32 columns).
+- **Rows** = `2 ^ log_rows` (default `19` → 524 288 rows).
+- **Blowup** = 2 (matches Lambda production `GoldilocksCubicProofOptions::with_blowup(2)`).
+- **FRI queries** = 219, grinding = 0.
+
+The timing window on both sides is **`Instant::now()` around `prove`, no
+verification, no proof serialization**:
+
+| Phase                                | Lambda STARK | Plonky3 |
+|--------------------------------------|:------------:|:-------:|
+| Build AIR + trace                    | ❌ (outside) | ❌ (outside) |
+| Build public inputs                  | ❌ (outside) | ❌ (outside) |
+| Prove (Round 1 → Round 4)            | ✅          | ✅ (`p3_uni_stark::prove`) |
+| Proof serialize / disk write         | ❌          | ❌ |
+| Verify                               | ❌          | ❌ |
+
+Lambda's trace, public inputs, and AIR are constructed via
+`lambda_fibonacci_pair::{compute_trace, create_public_inputs, FibonacciPairMultiColAIR}`.
+Plonky3's counterpart uses `plonky3_fibonacci::{P3FibonacciAir, generate_fibonacci_trace, public_values}`
+with `plonky3_config::matched_params_config`. The two traces are **cell-by-cell
+identical** — this is asserted by the `lambda_pair_trace_matches_plonky3_trace`
+test.
+
+## Prerequisites
+
+- Rust stable (the crate builds with `cargo build --release`).
+- No SP1 toolchain needed — there's no VM guest compilation.
+- For `--no-p3-patch` mode: a network-reachable crates.io (the script pulls
+  vanilla `p3-goldilocks 0.5.2` on demand).
+- For default mode (with the degree-3 patch): the vendored crate at
+  `bench_vs_plonky3/p3-goldilocks-patched/` and the root `[patch.crates-io]`
+  entry pointing at it.
+
+## Usage
+
+```bash
+# Default: log-rows=19, num-sequences=16, runs=3, with degree-3 patch, no scalar
+./bench_vs_plonky3/run.sh
+
+# Size sweep
+./bench_vs_plonky3/run.sh --log-rows 17 18 19 20
+
+# Single prover
+./bench_vs_plonky3/run.sh --lambda-only
+./bench_vs_plonky3/run.sh --p3-only
+
+# Nightly-equivalent (vanilla P3 degree-2, scalar on both sides)
+./bench_vs_plonky3/run.sh --no-p3-patch --scalar
+
+# Write machine-readable artifacts
+./bench_vs_plonky3/run.sh --report-dir /tmp/p3_report --no-color
+```
+
+### Flags
+
+| Flag | Default | Effect |
+|---|---|---|
+| `--log-rows K [K ...]` | `19` | One or more log2 row counts (each run uses `2^K` rows). |
+| `--num-sequences N` | `16` | Number of Fibonacci sequences (columns = `2 × N`). |
+| `--runs N` | `3` | Runs per `(size, prover)`; median is reported. |
+| `--lambda-only` / `--p3-only` | both | Restrict to a single prover. |
+| `--report-dir DIR` | — | Write TSV + metrics + raw stdouts. |
+| `--no-p3-patch` | off | Comment the root `[patch.crates-io]` before building and restore on exit. Plonky3 compiles against vanilla crates.io `p3-goldilocks 0.5.2` (`BinomialExtensionField<Val, 2>`). Lambda still runs degree 3 — the extension fields differ across sides but the AIRs stay identical. |
+| `--scalar` | off | Pin `RUSTFLAGS` to disable SIMD on both sides. On `x86_64` drops AVX2 and AVX-512 (Goldilocks + most of Keccak go scalar, SSE2 residual on `p3-keccak`). On `aarch64` drops the `sha3` ISA extension (Keccak accelerator). |
+| `--no-color` | off | Disable ANSI colors. |
+| `-h` / `--help` | — | Print usage. |
+
+## Output
+
+Stdout (without `--report-dir`):
+
+```
+=== STARK prove benchmark: Lambda vs Plonky3 ===
+  log-rows:       19
+  num-sequences:  16  (columns = 32)
+  runs/size:      3  (median reported)
+  p3 extension:   degree 2 (vanilla, no patch)
+  scalar mode:    on  (arch=x86_64, RUSTFLAGS="-C target-feature=-avx2,-avx512f")
+
+[build] prove_bench
+--- log-rows=19  (rows = 524288) ---
+  [lambda] median 2.444s from 3 runs: 2.444,2.279,2.830
+  [p3]     median 0.988s from 3 runs: 0.981,0.993,0.988
+
+=== Summary ===
+  log-rows   rows              Lambda (s)          P3 (s)        L/P3
+  --------   ----              ----------          ------        ----
+  19         524288                2.444s          0.988s      2.474x  (P3 faster)
+
+Timing window: single-shot end-to-end prove.
+Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster.
+```
+
+With `--report-dir DIR` the script writes:
+
+- `results.tsv` — tab-separated raw data (`log_rows, rows, lambda_median_s,
+  p3_median_s, ratio_lambda_over_p3, runs`).
+- `metrics.txt` — key=value pairs with the config used (arch, scalar flag,
+  extension degree, blowup, queries, runs, rustflags) and the per-series
+  values slash-joined (so post-processing scripts can split easily).
+- `raw/` — per-invocation stdouts (`{prover}_log{K}_run{i}.stdout`).
+
+No markdown file is generated — the TSV is the single source of truth for
+downstream tooling.
+
+## Nightly
+
+A GitHub Actions workflow (`.github/workflows/bench-vs-p3-nightly.yml`) runs
+daily at 07:30 UTC (04:30 Buenos Aires, after the SP1 nightly completes) on
+the self-hosted `bench` runner. It executes:
+
+```bash
+bash ./bench_vs_plonky3/run.sh \
+  --log-rows 19 \
+  --num-sequences 16 \
+  --runs 3 \
+  --no-p3-patch \
+  --scalar \
+  --report-dir bench_vs_p3_artifacts \
+  --no-color
+```
+
+The `bench_vs_p3_artifacts/` directory is uploaded as an artifact named
+`bench-vs-p3-nightly-<run_number>-<sha>` with 90-day retention.
+
+## Notes on fairness
+
+- **Extension field**: default mode uses the vendored `p3-goldilocks-patched`
+  (`BinomiallyExtendable<3>`, same `x^3 - 2` as Lambda). `--no-p3-patch` falls
+  back to upstream degree-2 — Lambda still runs degree-3, so the sides differ.
+  The nightly runs in the degree-2 mode to track the "shipped P3 vs shipped
+  Lambda" comparison.
+- **Parallelism**: both provers are multi-threaded by default. Lambda pulls
+  rayon via `stark/parallel`; Plonky3 pulls rayon via
+  `p3-uni-stark` / `p3-dft` (hardcoded `features = ["parallel"]`, always on).
+- **SIMD**: without `--scalar`, each side uses whatever target-features the
+  compiler decides from the host CPU. `--scalar` equalises Goldilocks on
+  `x86_64` (no AVX2/AVX-512) or disables the ARMv8.4 SHA3 Keccak extension on
+  `aarch64`. `p3-keccak`'s SSE2 path on x86 is not disabled.
+- **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both
+  sides. Security models differ (Lambda: Johnson-bound, ~108 bits; P3:
+  conjectured, ~192 bits) — the compute work is equivalent, the claimed
+  soundness is not. See `ANALYSIS_LOG.md` for the full fairness audit.

From e72772b5542474633e420bbcf8bb8f9dfd74a951 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 19:46:23 -0300
Subject: [PATCH 20/34]  Fix --no-p3-patch cleanup,

---
 bench_vs_plonky3/run.sh | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index 550a83a77..a0ace698d 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -127,12 +127,22 @@ fi
 # p3-goldilocks-patched (adds BinomiallyExtendable<3>, disables NEON). For the
 # nightly we build against vanilla crates.io p3-goldilocks — we comment the
 # block out and drop the `p3-degree3` feature.
+#
+# Both Cargo.toml AND Cargo.lock are backed up before the build: dropping the
+# patch makes cargo re-resolve p3-goldilocks against crates.io, which rewrites
+# Cargo.lock. The trap restores both so the working tree is clean on exit.
 CARGO_TOML="$ROOT_DIR/Cargo.toml"
+CARGO_LOCK="$ROOT_DIR/Cargo.lock"
 CARGO_TOML_BAK=""
+CARGO_LOCK_BAK=""
 BUILD_FEATURE_FLAGS=()
 if $NO_P3_PATCH; then
     CARGO_TOML_BAK="$CARGO_TOML.bak.p3bench.$$"
     cp "$CARGO_TOML" "$CARGO_TOML_BAK"
+    if [ -f "$CARGO_LOCK" ]; then
+        CARGO_LOCK_BAK="$CARGO_LOCK.bak.p3bench.$$"
+        cp "$CARGO_LOCK" "$CARGO_LOCK_BAK"
+    fi
     # Comment the [patch.crates-io] block and its entries (until the next blank
     # line or next [section]).
     python3 - "$CARGO_TOML" <<'PY'
@@ -161,7 +171,7 @@ for ln in lines:
         out.append(ln)
 path.write_text("".join(out))
 PY
-    trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi' EXIT INT TERM
+    trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi; if [ -n "$CARGO_LOCK_BAK" ] && [ -f "$CARGO_LOCK_BAK" ]; then mv "$CARGO_LOCK_BAK" "$CARGO_LOCK"; fi' EXIT INT TERM
     BUILD_FEATURE_FLAGS=(--no-default-features --features parallel)
 fi
 
@@ -219,7 +229,13 @@ cargo build --release -p bench-vs-plonky3 --bin prove_bench \
     --manifest-path "$ROOT_DIR/Cargo.toml" \
     ${BUILD_FEATURE_FLAGS[@]+"${BUILD_FEATURE_FLAGS[@]}"} 2>&1 | tail -5
 
-BIN="$ROOT_DIR/target/release/prove_bench"
+# Resolve the actual target directory via cargo metadata so we find the binary
+# whether cargo used ./target/ (default) or a custom CARGO_TARGET_DIR.
+TARGET_DIR=$(cargo metadata --manifest-path "$ROOT_DIR/Cargo.toml" \
+    --format-version 1 --no-deps 2>/dev/null \
+    | python3 -c 'import json, sys; print(json.load(sys.stdin)["target_directory"])' \
+    2>/dev/null || echo "$ROOT_DIR/target")
+BIN="$TARGET_DIR/release/prove_bench"
 if [ ! -x "$BIN" ]; then
     echo -e "${RED}[build] prove_bench not produced at $BIN${NC}"
     exit 1
@@ -393,7 +409,18 @@ if [ -n "$REPORT_DIR" ]; then
         done
     } > "$REPORT_DIR/results.tsv"
 
+    # Capture commit + timestamp so the artifact is self-describing.
+    git_sha="$(git -C "$ROOT_DIR" rev-parse HEAD 2>/dev/null || echo unknown)"
+    git_dirty="clean"
+    if ! git -C "$ROOT_DIR" diff --quiet HEAD -- 2>/dev/null; then
+        git_dirty="dirty"
+    fi
+    timestamp_utc="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
     {
+        echo "timestamp_utc=$timestamp_utc"
+        echo "git_sha=$git_sha"
+        echo "git_tree=$git_dirty"
         echo "arch=$(uname -m)"
         echo "num_sequences=$NUM_SEQUENCES"
         echo "columns=$((2 * NUM_SEQUENCES))"

From ffefff4f9868a37b5f262d2854211ff48034d808 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Thu, 16 Apr 2026 20:16:05 -0300
Subject: [PATCH 21/34] Add breakdown section to README and match nightly size
 in instruments_breakdown

---
 bench_vs_plonky3/README.md  | 42 +++++++++++++++++++++++++++++++++++++
 bench_vs_plonky3/src/lib.rs |  2 +-
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md
index 04260876f..727d6cce9 100644
--- a/bench_vs_plonky3/README.md
+++ b/bench_vs_plonky3/README.md
@@ -134,6 +134,48 @@ bash ./bench_vs_plonky3/run.sh \
 The `bench_vs_p3_artifacts/` directory is uploaded as an artifact named
 `bench-vs-p3-nightly-<run_number>-<sha>` with 90-day retention.
 
+## Breakdown (per-phase timing) for manual analysis
+
+The nightly only reports wall-clock totals. When you need to see *where* the
+time goes (constraint eval vs FFT vs FRI vs Merkle vs queries on the Lambda
+side, and the per-span breakdown on the Plonky3 side), run the
+`instruments_breakdown` test:
+
+```bash
+# x86_64 (server), Goldilocks scalar:
+RUSTFLAGS="-C target-feature=-avx2,-avx512f" \
+cargo test -p bench-vs-plonky3 --features instruments --release -- \
+  instruments_breakdown --nocapture
+
+# aarch64 (M1), 100% scalar:
+RUSTFLAGS="-C target-feature=-sha3" \
+cargo test -p bench-vs-plonky3 --features instruments --release -- \
+  instruments_breakdown --nocapture
+```
+
+- `--features instruments` activates `stark/instruments` — without it, the
+  per-phase timers are no-ops and the Lambda breakdown prints zeros.
+- `--release` is mandatory (debug numbers are meaningless).
+- `--nocapture` is required to see the output (`cargo test` swallows stdout
+  otherwise).
+- The test hardcodes `num_sequences = 16`, `rows = 1 << 19` (524 288), same
+  shape as the nightly, so the breakdown maps onto the nightly numbers.
+- Output is split into two sections:
+  - **Lambda**: explicit per-phase totals (Pre-pass / R1 Main commits / R1 Aux
+    build+commit / Rounds 2-4) plus sub-ops (Main LDE, Main Merkle, constraint
+    eval, decompose+extend, composition Merkle, OOD, deep comp, deep extend,
+    FRI commit, queries+open).
+  - **Plonky3**: every `tracing` span emitted at DEBUG during
+    `p3_uni_stark::prove`, sorted by wall-clock descending, filtered ≥ 0.1 ms.
+    Spans nest (e.g. `prove ⊃ compute_quotient_values`), so Σspans > total is
+    expected and not a bug. `(unaccounted)` can be negative from nesting.
+
+Details of every timer (which method it wraps, where it lives) are in
+[`INSTRUMENTATION.md`](INSTRUMENTATION.md).
+
+The nightly does **not** activate this path — it would add ~1 % overhead and
+pollute the historical wall-clock numbers.
+
 ## Notes on fairness
 
 - **Extension field**: default mode uses the vendored `p3-goldilocks-patched`
diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs
index 31e9ef470..d61c6ea9e 100644
--- a/bench_vs_plonky3/src/lib.rs
+++ b/bench_vs_plonky3/src/lib.rs
@@ -80,7 +80,7 @@ mod tests {
     #[test]
     fn instruments_breakdown() {
         let num_sequences = 16;
-        let rows = 1 << 18;
+        let rows = 1 << 19;
         let proof_options = benchmark_proof_options();
 
         let initial_values: Vec<(FE, FE)> = (0..num_sequences)

From e21518b09a6a7a87f943ddfc2ca4747df4374b08 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Sun, 19 Apr 2026 12:37:06 -0300
Subject: [PATCH 22/34] use fork

---
 .github/workflows/bench-vs-p3-nightly.yml     |    1 -
 Cargo.lock                                    |  231 +-
 Cargo.toml                                    |    8 -
 bench_vs_plonky3/Cargo.toml                   |   44 +-
 bench_vs_plonky3/INSTRUMENTATION.md           |   26 +-
 bench_vs_plonky3/README.md                    |   41 +-
 .../p3-goldilocks-patched/Cargo.toml          |  129 -
 .../benches/bench_field.rs                    |   72 -
 .../benches/extension.rs                      |   40 -
 .../src/aarch64_neon/mds.rs                   |  343 ---
 .../src/aarch64_neon/mod.rs                   |   12 -
 .../src/aarch64_neon/packing.rs               |  404 ---
 .../src/aarch64_neon/poseidon1.rs             |  716 -----
 .../src/aarch64_neon/poseidon1_asm.rs         |  843 ------
 .../src/aarch64_neon/poseidon2.rs             |  652 ----
 .../src/aarch64_neon/poseidon2_asm.rs         | 2621 -----------------
 .../src/aarch64_neon/utils.rs                 |  400 ---
 .../p3-goldilocks-patched/src/extension.rs    |  217 --
 .../p3-goldilocks-patched/src/goldilocks.rs   |  813 -----
 .../p3-goldilocks-patched/src/lib.rs          |   42 -
 .../p3-goldilocks-patched/src/mds.rs          |  761 -----
 .../p3-goldilocks-patched/src/poseidon1.rs    | 1143 -------
 .../p3-goldilocks-patched/src/poseidon2.rs    |  980 ------
 .../src/x86_64_avx2/mds.rs                    |   86 -
 .../src/x86_64_avx2/mod.rs                    |    3 -
 .../src/x86_64_avx2/packing.rs                |  539 ----
 .../src/x86_64_avx512/mds.rs                  |   86 -
 .../src/x86_64_avx512/mod.rs                  |    3 -
 .../src/x86_64_avx512/packing.rs              |  444 ---
 bench_vs_plonky3/run.sh                       |  112 +-
 bench_vs_plonky3/src/plonky3_config.rs        |   17 +-
 31 files changed, 171 insertions(+), 11658 deletions(-)
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs
 delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs

diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml
index f7856a139..d27bd9010 100644
--- a/.github/workflows/bench-vs-p3-nightly.yml
+++ b/.github/workflows/bench-vs-p3-nightly.yml
@@ -35,7 +35,6 @@ jobs:
             --log-rows 19 \
             --num-sequences 16 \
             --runs 3 \
-            --no-p3-patch \
             --scalar \
             --report-dir bench_vs_p3_artifacts \
             --no-color
diff --git a/Cargo.lock b/Cargo.lock
index ae5305254..98bdb17b4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -303,14 +303,14 @@ dependencies = [
  "p3-air",
  "p3-challenger",
  "p3-commit",
- "p3-dft 0.5.2",
- "p3-field 0.5.2",
+ "p3-dft 0.5.1",
+ "p3-field 0.5.1",
  "p3-fri",
  "p3-goldilocks",
  "p3-keccak",
- "p3-matrix 0.5.2",
+ "p3-matrix 0.5.1",
  "p3-merkle-tree",
- "p3-symmetric 0.5.2",
+ "p3-symmetric 0.5.1",
  "p3-uni-stark",
  "stark",
  "tracing",
@@ -2256,12 +2256,11 @@ dependencies = [
 
 [[package]]
 name = "p3-air"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f2ec9cbfc642fc5173817287c3f8b789d07743b5f7e812d058b7a03e344f9ab"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
  "tracing",
 ]
 
@@ -2282,30 +2281,28 @@ dependencies = [
 
 [[package]]
 name = "p3-challenger"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a0b490c745a7d2adeeafff06411814c8078c432740162332b3cd71be0158a76"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
- "p3-field 0.5.2",
- "p3-maybe-rayon 0.5.2",
+ "p3-field 0.5.1",
+ "p3-maybe-rayon 0.5.1",
  "p3-monty-31",
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "tracing",
 ]
 
 [[package]]
 name = "p3-commit"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "916ae7989d5c3b49f887f5c55b2f9826bdbb81aaebf834503c4145d8b267c829"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
  "p3-challenger",
- "p3-dft 0.5.2",
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
- "p3-util 0.5.2",
+ "p3-dft 0.5.1",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
+ "p3-util 0.5.1",
  "serde",
 ]
 
@@ -2324,15 +2321,14 @@ dependencies = [
 
 [[package]]
 name = "p3-dft"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55301e91544440254977108b85c32c09d7ea05f2f0dd61092a2825339906a4a7"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-util 0.5.2",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
+ "p3-maybe-rayon 0.5.1",
+ "p3-util 0.5.1",
  "spin 0.10.0",
  "tracing",
 ]
@@ -2353,14 +2349,13 @@ dependencies = [
 
 [[package]]
 name = "p3-field"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85affca7fc983889f260655c4cf74163eebb94605f702e4b6809ead707cba54f"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
  "num-bigint 0.4.6",
- "p3-maybe-rayon 0.5.2",
- "p3-util 0.5.2",
+ "p3-maybe-rayon 0.5.1",
+ "p3-util 0.5.1",
  "paste",
  "rand 0.10.1",
  "serde",
@@ -2369,19 +2364,17 @@ dependencies = [
 
 [[package]]
 name = "p3-fri"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ac25574ed306b4c9ad1969faaecc0fe6081d45ad7e1ec236661a6e0e37b39e1"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
  "p3-challenger",
  "p3-commit",
- "p3-dft 0.5.2",
- "p3-field 0.5.2",
- "p3-interpolation",
- "p3-matrix 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-util 0.5.2",
+ "p3-dft 0.5.1",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
+ "p3-maybe-rayon 0.5.1",
+ "p3-util 0.5.1",
  "rand 0.10.1",
  "serde",
  "spin 0.10.0",
@@ -2391,42 +2384,30 @@ dependencies = [
 
 [[package]]
 name = "p3-goldilocks"
-version = "0.5.2"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "num-bigint 0.4.6",
  "p3-challenger",
- "p3-dft 0.5.2",
- "p3-field 0.5.2",
- "p3-mds 0.5.2",
+ "p3-dft 0.5.1",
+ "p3-field 0.5.1",
+ "p3-mds 0.5.1",
  "p3-poseidon1",
- "p3-poseidon2 0.5.2",
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-poseidon2 0.5.1",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "paste",
  "rand 0.10.1",
  "serde",
 ]
 
-[[package]]
-name = "p3-interpolation"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14fd48db63ff15f5e96dc46e6991dbc2d39431b82dcb154bad90f4579236e328"
-dependencies = [
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-util 0.5.2",
-]
-
 [[package]]
 name = "p3-keccak"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebcf27615ece1995e4fcf4c69740f1cf515d1481367a20b4b3ce7f4f1b8d70f7"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "tiny-keccak",
 ]
 
@@ -2447,14 +2428,13 @@ dependencies = [
 
 [[package]]
 name = "p3-matrix"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53428126b009071563d1d07305a9de8be0d21de00b57d2475289ee32ffca6577"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
- "p3-field 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-util 0.5.2",
+ "p3-field 0.5.1",
+ "p3-maybe-rayon 0.5.1",
+ "p3-util 0.5.1",
  "rand 0.10.1",
  "serde",
  "tracing",
@@ -2468,9 +2448,8 @@ checksum = "c3968ad1160310296eb04f91a5f4edfa38fe1d6b2b8cd6b5c64e6f9b7370979e"
 
 [[package]]
 name = "p3-maybe-rayon"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "082bf467011c06c768c579ec6eb9accb5e1e62108891634cc770396e917f978a"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "rayon",
 ]
@@ -2492,30 +2471,28 @@ dependencies = [
 
 [[package]]
 name = "p3-mds"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35209e6214102ea6ec6b8cb1b9c15a9b8e597a39f9173597c957f123bced81b3"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
- "p3-dft 0.5.2",
- "p3-field 0.5.2",
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-dft 0.5.1",
+ "p3-field 0.5.1",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "rand 0.10.1",
 ]
 
 [[package]]
 name = "p3-merkle-tree"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "182a5383a54c50f47866f819946d28d95262f69967902734de8fdecb0d70c774"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
  "p3-commit",
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
+ "p3-maybe-rayon 0.5.1",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "rand 0.10.1",
  "serde",
  "thiserror 2.0.17",
@@ -2524,21 +2501,20 @@ dependencies = [
 
 [[package]]
 name = "p3-monty-31"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ffa8c99ec50c035020bbf5457c6a729ba6a975719c1a8dd3f16421081e4f650c"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
  "num-bigint 0.4.6",
- "p3-dft 0.5.2",
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-mds 0.5.2",
+ "p3-dft 0.5.1",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
+ "p3-maybe-rayon 0.5.1",
+ "p3-mds 0.5.1",
  "p3-poseidon1",
- "p3-poseidon2 0.5.2",
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-poseidon2 0.5.1",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "paste",
  "rand 0.10.1",
  "serde",
@@ -2548,12 +2524,11 @@ dependencies = [
 
 [[package]]
 name = "p3-poseidon1"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a018b618e3fa0aec8be933b1d8e404edd23f46991f6bf3f5c2f3f95e9413fe9"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
- "p3-field 0.5.2",
- "p3-symmetric 0.5.2",
+ "p3-field 0.5.1",
+ "p3-symmetric 0.5.1",
  "rand 0.10.1",
 ]
 
@@ -2573,14 +2548,13 @@ dependencies = [
 
 [[package]]
 name = "p3-poseidon2"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "256a668a9ba916f8767552f13d0ba50d18968bc74a623bfdafa41e2970c944d0"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
- "p3-field 0.5.2",
- "p3-mds 0.5.2",
- "p3-symmetric 0.5.2",
- "p3-util 0.5.2",
+ "p3-field 0.5.1",
+ "p3-mds 0.5.1",
+ "p3-symmetric 0.5.1",
+ "p3-util 0.5.1",
  "rand 0.10.1",
 ]
 
@@ -2597,30 +2571,28 @@ dependencies = [
 
 [[package]]
 name = "p3-symmetric"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c60a71a1507c13611b0f2b0b6e83669fd5b76f8e3115bcbced5ccfdf3ca7807"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
- "p3-field 0.5.2",
- "p3-util 0.5.2",
+ "p3-field 0.5.1",
+ "p3-util 0.5.1",
  "serde",
 ]
 
 [[package]]
 name = "p3-uni-stark"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c4ecaad8a7b4cf0fc711278c7a29fdc6d14239157866b17feaf14061834bc51"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "itertools 0.14.0",
  "p3-air",
  "p3-challenger",
  "p3-commit",
- "p3-field 0.5.2",
- "p3-matrix 0.5.2",
- "p3-maybe-rayon 0.5.2",
- "p3-util 0.5.2",
+ "p3-field 0.5.1",
+ "p3-matrix 0.5.1",
+ "p3-maybe-rayon 0.5.1",
+ "p3-util 0.5.1",
  "serde",
  "thiserror 2.0.17",
  "tracing",
@@ -2637,9 +2609,8 @@ dependencies = [
 
 [[package]]
 name = "p3-util"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8b766b9e9254bf3fa98d76e42cf8a5b30628c182dfd5272d270076ee12f0fc0"
+version = "0.5.1"
+source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a"
 dependencies = [
  "serde",
  "transpose",
diff --git a/Cargo.toml b/Cargo.toml
index 886c206f2..031606010 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,11 +19,3 @@ debug = true
 
 # For profiling with samply/perf, build with:
 #   CARGO_PROFILE_RELEASE_DEBUG=1 cargo build --release
-
-# Patched p3-goldilocks adds a BinomiallyExtendable<3> impl for degree-3
-# extension (same as Lambda's x^3 - 2) and disables NEON packing on aarch64.
-# Used only by bench_vs_plonky3 for apples-to-apples comparisons against
-# Lambda STARK. The nightly workflow comments this block out at CI time to
-# benchmark vanilla p3-goldilocks (degree-2 extension).
-[patch.crates-io]
-p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" }
diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index a3d4e02e2..5b313106f 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -9,19 +9,24 @@ stark = { path = "../crypto/stark", features = ["test-utils"] }
 crypto = { path = "../crypto/crypto", features = ["std", "serde"] }
 math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] }
 
-# Plonky3 (all 0.5.2)
-p3-air = "0.5.2"
-p3-field = "0.5.2"
-p3-goldilocks = "0.5.2"
-p3-matrix = "0.5.2"
-p3-commit = "0.5.2"
-p3-challenger = "0.5.2"
-p3-symmetric = "0.5.2"
-p3-merkle-tree = "0.5.2"
-p3-keccak = "0.5.2"
-p3-fri = "0.5.2"
-p3-uni-stark = { version = "0.5.2", features = ["parallel"] }
-p3-dft = { version = "0.5.2", features = ["parallel"] }
+# Plonky3: pinned to the yetanotherco fork, branch `feat/goldilocks_deg3`.
+# The branch adds BinomiallyExtendable<3> for Goldilocks (x^3 - 2), matching
+# Lambda's Degree3GoldilocksExtensionField. All p3-* crates MUST resolve to
+# the same git source + ref; declaring any of them as a crates.io dep would
+# pull in a second incompatible p3-field. cargo clones the fork once into
+# ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time.
+p3-air         = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-field       = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-goldilocks  = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-matrix      = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-commit      = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-challenger  = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-symmetric   = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-keccak      = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-fri         = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-uni-stark   = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] }
+p3-dft         = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] }
 
 # Tracing for P3 span-based profiling
 tracing = "0.1"
@@ -34,18 +39,11 @@ criterion = { version = "0.4", default-features = false }
 # Both provers run multi-threaded by default: Plonky3's `Radix2DitParallel` DFT
 # uses rayon unconditionally, so Lambda must also enable `parallel` for a fair
 # apples-to-apples comparison. Disable with `--no-default-features` to compare
-# single-threaded.
-#
-# `p3-degree3` (default on) selects the cubic extension for Plonky3's
-# Challenge type, matching Lambda's `Degree3GoldilocksExtensionField`. It
-# requires the root `[patch.crates-io]` pointing at p3-goldilocks-patched.
-# Disable it (`--no-default-features --features parallel`) together with
-# commenting the patch block to build against vanilla crates.io
-# p3-goldilocks (degree-2 extension).
-default = ["parallel", "p3-degree3"]
+# single-threaded. Cubic extension (`x^3 - 2`) matching Lambda is unconditional
+# — the fork ships `BinomiallyExtendable<3>` for Goldilocks natively.
+default = ["parallel"]
 parallel = ["stark/parallel"]
 instruments = ["stark/instruments"]
-p3-degree3 = []
 
 [[bin]]
 name = "prove_bench"
diff --git a/bench_vs_plonky3/INSTRUMENTATION.md b/bench_vs_plonky3/INSTRUMENTATION.md
index 0d82afe0e..b7b6bd4b1 100644
--- a/bench_vs_plonky3/INSTRUMENTATION.md
+++ b/bench_vs_plonky3/INSTRUMENTATION.md
@@ -10,14 +10,6 @@ El test que imprime el breakdown se llama `instruments_breakdown`. Hay que
 compilar con la feature `instruments` y pasar `--nocapture` porque la salida
 va a stdout (si no, `cargo test` se la come).
 
-**M1 (100% scalar, fairest):**
-
-```bash
-RUSTFLAGS="-C target-feature=-sha3" \
-cargo test -p bench-vs-plonky3 --features instruments --release -- \
-  instruments_breakdown --nocapture
-```
-
 **x86 (Goldilocks scalar, SSE2 Keccak residual en P3):**
 
 ```bash
@@ -189,15 +181,9 @@ timings y aparecen en logs distintos.
    tiempo fuera de `multi_prove` (construcción de AIR, setup).
 4. Los porcentajes de Plonky3 se calculan contra **`p3_prove_dur`** (solo el
    `prove`, sin setup).
-5. El benchmark usa **degree 3** para la extensión de Plonky3 *sólo* si el
-   root `Cargo.toml` mantiene:
-   ```toml
-   [patch.crates-io]
-   p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" }
-   ```
-   (línea 26). Sin ese patch, P3 usa la extensión degree 2 de upstream y la
-   comparación deja de ser fair.
-6. Plataforma:
-   - M1: `RUSTFLAGS="-C target-feature=-sha3"` → scalar en ambos lados.
-   - x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` → Goldilocks scalar,
-     residual SSE2 en Keccak de P3 (~7%).
+5. El benchmark usa **degree 3** para la extensión de Plonky3 vía git deps a
+   la rama `feat/goldilocks_deg3` del fork `yetanotherco/Plonky3` (ver
+   `bench_vs_plonky3/Cargo.toml`), que provee `BinomiallyExtendable<3>`
+   para Goldilocks con el mismo irreducible `x^3 - 2` que Lambda.
+6. Plataforma: x86 con `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` →
+   Goldilocks scalar, residual SSE2 en Keccak de P3 (~7%).
diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md
index 727d6cce9..fea3c8d7e 100644
--- a/bench_vs_plonky3/README.md
+++ b/bench_vs_plonky3/README.md
@@ -36,16 +36,16 @@ test.
 
 - Rust stable (the crate builds with `cargo build --release`).
 - No SP1 toolchain needed — there's no VM guest compilation.
-- For `--no-p3-patch` mode: a network-reachable crates.io (the script pulls
-  vanilla `p3-goldilocks 0.5.2` on demand).
-- For default mode (with the degree-3 patch): the vendored crate at
-  `bench_vs_plonky3/p3-goldilocks-patched/` and the root `[patch.crates-io]`
-  entry pointing at it.
+- Read access to `https://github.com/yetanotherco/Plonky3.git` (branch
+  `feat/goldilocks_deg3`). Cargo clones it into `~/.cargo/git/db` on the
+  first build and `Cargo.lock` pins the SHA. The branch provides
+  `BinomiallyExtendable<3>` for Goldilocks (`x^3 - 2`, matching Lambda's
+  `Degree3GoldilocksExtensionField`).
 
 ## Usage
 
 ```bash
-# Default: log-rows=19, num-sequences=16, runs=3, with degree-3 patch, no scalar
+# Default: log-rows=19, num-sequences=16, runs=3, cubic extension, no scalar
 ./bench_vs_plonky3/run.sh
 
 # Size sweep
@@ -55,8 +55,8 @@ test.
 ./bench_vs_plonky3/run.sh --lambda-only
 ./bench_vs_plonky3/run.sh --p3-only
 
-# Nightly-equivalent (vanilla P3 degree-2, scalar on both sides)
-./bench_vs_plonky3/run.sh --no-p3-patch --scalar
+# Scalar mode on both sides (x86_64 only — disables AVX2/AVX-512)
+./bench_vs_plonky3/run.sh --scalar
 
 # Write machine-readable artifacts
 ./bench_vs_plonky3/run.sh --report-dir /tmp/p3_report --no-color
@@ -71,8 +71,7 @@ test.
 | `--runs N` | `3` | Runs per `(size, prover)`; median is reported. |
 | `--lambda-only` / `--p3-only` | both | Restrict to a single prover. |
 | `--report-dir DIR` | — | Write TSV + metrics + raw stdouts. |
-| `--no-p3-patch` | off | Comment the root `[patch.crates-io]` before building and restore on exit. Plonky3 compiles against vanilla crates.io `p3-goldilocks 0.5.2` (`BinomialExtensionField<Val, 2>`). Lambda still runs degree 3 — the extension fields differ across sides but the AIRs stay identical. |
-| `--scalar` | off | Pin `RUSTFLAGS` to disable SIMD on both sides. On `x86_64` drops AVX2 and AVX-512 (Goldilocks + most of Keccak go scalar, SSE2 residual on `p3-keccak`). On `aarch64` drops the `sha3` ISA extension (Keccak accelerator). |
+| `--scalar` | off | Pin `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` so Goldilocks (and most of Keccak) run scalar on both sides. x86_64 only; on other archs the flag is ignored with a warning. Residual SSE2 on `p3-keccak` remains (~7% of total prove time). |
 | `--no-color` | off | Disable ANSI colors. |
 | `-h` / `--help` | — | Print usage. |
 
@@ -85,7 +84,7 @@ Stdout (without `--report-dir`):
   log-rows:       19
   num-sequences:  16  (columns = 32)
   runs/size:      3  (median reported)
-  p3 extension:   degree 2 (vanilla, no patch)
+  p3 extension:   degree 3 (forked p3-goldilocks, matches Lambda)
   scalar mode:    on  (arch=x86_64, RUSTFLAGS="-C target-feature=-avx2,-avx512f")
 
 [build] prove_bench
@@ -125,7 +124,6 @@ bash ./bench_vs_plonky3/run.sh \
   --log-rows 19 \
   --num-sequences 16 \
   --runs 3 \
-  --no-p3-patch \
   --scalar \
   --report-dir bench_vs_p3_artifacts \
   --no-color
@@ -144,11 +142,6 @@ side, and the per-span breakdown on the Plonky3 side), run the
 ```bash
 # x86_64 (server), Goldilocks scalar:
 RUSTFLAGS="-C target-feature=-avx2,-avx512f" \
-cargo test -p bench-vs-plonky3 --features instruments --release -- \
-  instruments_breakdown --nocapture
-
-# aarch64 (M1), 100% scalar:
-RUSTFLAGS="-C target-feature=-sha3" \
 cargo test -p bench-vs-plonky3 --features instruments --release -- \
   instruments_breakdown --nocapture
 ```
@@ -178,18 +171,16 @@ pollute the historical wall-clock numbers.
 
 ## Notes on fairness
 
-- **Extension field**: default mode uses the vendored `p3-goldilocks-patched`
-  (`BinomiallyExtendable<3>`, same `x^3 - 2` as Lambda). `--no-p3-patch` falls
-  back to upstream degree-2 — Lambda still runs degree-3, so the sides differ.
-  The nightly runs in the degree-2 mode to track the "shipped P3 vs shipped
-  Lambda" comparison.
+- **Extension field**: Plonky3 runs `BinomialExtensionField<Goldilocks, 3>`
+  with the same `x^3 - 2` irreducible as Lambda's
+  `Degree3GoldilocksExtensionField`. Both sides use the same cubic extension.
 - **Parallelism**: both provers are multi-threaded by default. Lambda pulls
   rayon via `stark/parallel`; Plonky3 pulls rayon via
   `p3-uni-stark` / `p3-dft` (hardcoded `features = ["parallel"]`, always on).
 - **SIMD**: without `--scalar`, each side uses whatever target-features the
-  compiler decides from the host CPU. `--scalar` equalises Goldilocks on
-  `x86_64` (no AVX2/AVX-512) or disables the ARMv8.4 SHA3 Keccak extension on
-  `aarch64`. `p3-keccak`'s SSE2 path on x86 is not disabled.
+  compiler decides from the host CPU. `--scalar` (x86_64 only) disables AVX2
+  and AVX-512 so Goldilocks arithmetic is scalar on both sides. `p3-keccak`'s
+  SSE2 path on x86 is not disabled.
 - **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both
   sides. Security models differ (Lambda: Johnson-bound, ~108 bits; P3:
   conjectured, ~192 bits) — the compute work is equivalent, the claimed
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml b/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml
deleted file mode 100644
index 768a2bb5a..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml
+++ /dev/null
@@ -1,129 +0,0 @@
-# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
-#
-# When uploading crates to the registry Cargo will automatically
-# "normalize" Cargo.toml files for maximal compatibility
-# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies.
-#
-# If you are reading this file be aware that the original Cargo.toml
-# will likely look very different (and much more reasonable).
-# See Cargo.toml.orig for the original contents.
-
-[package]
-edition = "2024"
-name = "p3-goldilocks"
-version = "0.5.2"
-build = false
-autolib = false
-autobins = false
-autoexamples = false
-autotests = false
-autobenches = false
-description = "An implementation of the Goldilocks prime field F_p, where p = 2^64 - 2^32 + 1."
-homepage = "https://github.com/Plonky3/Plonky3"
-readme = false
-keywords = [
-    "cryptography",
-    "SNARK",
-    "PLONK",
-    "FRI",
-    "plonky3",
-]
-categories = ["cryptography::cryptocurrencies"]
-license = "MIT OR Apache-2.0"
-repository = "https://github.com/Plonky3/Plonky3"
-resolver = "2"
-
-[lib]
-name = "p3_goldilocks"
-path = "src/lib.rs"
-
-[[bench]]
-name = "bench_field"
-path = "benches/bench_field.rs"
-harness = false
-
-[[bench]]
-name = "extension"
-path = "benches/extension.rs"
-harness = false
-
-[dependencies.num-bigint]
-version = "0.4.6"
-default-features = false
-
-[dependencies.p3-challenger]
-version = "0.5.2"
-
-[dependencies.p3-dft]
-version = "0.5.2"
-
-[dependencies.p3-field]
-version = "0.5.2"
-
-[dependencies.p3-mds]
-version = "0.5.2"
-
-[dependencies.p3-poseidon1]
-version = "0.5.2"
-
-[dependencies.p3-poseidon2]
-version = "0.5.2"
-
-[dependencies.p3-symmetric]
-version = "0.5.2"
-
-[dependencies.p3-util]
-version = "0.5.2"
-
-[dependencies.paste]
-version = "1.0.15"
-
-[dependencies.rand]
-version = "0.10.0"
-default-features = false
-
-[dependencies.serde]
-version = "1.0"
-features = ["derive"]
-default-features = false
-
-[dev-dependencies.criterion]
-version = "0.8"
-
-[dev-dependencies.proptest]
-version = "1.10"
-
-[dev-dependencies.rand]
-version = "0.10.0"
-default-features = false
-
-[lints.clippy]
-cognitive_complexity = "allow"
-match_bool = "warn"
-needless_pass_by_value = "warn"
-redundant_pub_crate = "allow"
-semicolon_if_nothing_returned = "warn"
-too_long_first_doc_paragraph = "allow"
-transmute_undefined_repr = "allow"
-tuple_array_conversions = "allow"
-unused_peekable = "allow"
-
-[lints.clippy.all]
-level = "warn"
-priority = -1
-
-[lints.clippy.nursery]
-level = "warn"
-priority = -1
-
-[lints.rust]
-rust_2024_incompatible_pat = "warn"
-unused_must_use = "deny"
-
-[lints.rust.rust_2018_idioms]
-level = "deny"
-priority = -1
-
-[lints.rustdoc]
-all = "warn"
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs
deleted file mode 100644
index a0d5e05f4..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-use core::any::type_name;
-
-use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
-use p3_field::{Field, PrimeCharacteristicRing};
-use p3_field_testing::bench_func::{
-    benchmark_add_latency, benchmark_add_throughput, benchmark_chunked_linear_combination,
-    benchmark_inv, benchmark_iter_sum, benchmark_sub_latency, benchmark_sub_throughput,
-};
-use p3_field_testing::{
-    benchmark_dot_array, benchmark_mul_latency, benchmark_mul_throughput, benchmark_sum_array,
-};
-use p3_goldilocks::Goldilocks;
-use rand::rngs::SmallRng;
-use rand::{RngExt, SeedableRng};
-
-type F = Goldilocks;
-
-fn bench_field(c: &mut Criterion) {
-    let name = "Goldilocks";
-    const REPS: usize = 200;
-    benchmark_mul_latency::<F, 100>(c, name);
-    benchmark_mul_throughput::<F, 25>(c, name);
-    benchmark_inv::<F>(c, name);
-    benchmark_iter_sum::<F, 4, REPS>(c, name);
-    benchmark_sum_array::<F, 4, REPS>(c, name);
-
-    benchmark_dot_array::<F, 1>(c, name);
-    benchmark_dot_array::<F, 2>(c, name);
-    benchmark_dot_array::<F, 3>(c, name);
-    benchmark_dot_array::<F, 4>(c, name);
-    benchmark_dot_array::<F, 5>(c, name);
-    benchmark_dot_array::<F, 6>(c, name);
-
-    // Note that each round of throughput has 10 operations
-    // So we should have 10 * more repetitions for latency tests.
-    const L_REPS: usize = 10 * REPS;
-    benchmark_add_latency::<F, L_REPS>(c, name);
-    benchmark_add_throughput::<F, REPS>(c, name);
-    benchmark_sub_latency::<F, L_REPS>(c, name);
-    benchmark_sub_throughput::<F, REPS>(c, name);
-
-    benchmark_chunked_linear_combination::<F, F, 100>(c, name);
-
-    let mut rng = SmallRng::seed_from_u64(1);
-    c.bench_function("7th_root", |b| {
-        b.iter_batched(
-            || rng.random::<F>(),
-            |x| x.exp_u64(10540996611094048183),
-            BatchSize::SmallInput,
-        );
-    });
-}
-fn bench_packedfield(c: &mut Criterion) {
-    let name = type_name::<<F as Field>::Packing>().to_string();
-    // Note that each round of throughput has 10 operations
-    // So we should have 10 * more repetitions for latency tests.
-    const REPS: usize = 100;
-    const L_REPS: usize = 10 * REPS;
-
-    benchmark_add_latency::<<F as Field>::Packing, L_REPS>(c, &name);
-    benchmark_add_throughput::<<F as Field>::Packing, REPS>(c, &name);
-    benchmark_sub_latency::<<F as Field>::Packing, L_REPS>(c, &name);
-    benchmark_sub_throughput::<<F as Field>::Packing, REPS>(c, &name);
-    benchmark_mul_latency::<<F as Field>::Packing, L_REPS>(c, &name);
-    benchmark_mul_throughput::<<F as Field>::Packing, REPS>(c, &name);
-
-    type PF = <F as Field>::Packing;
-    benchmark_chunked_linear_combination::<F, PF, 100>(c, &name);
-}
-
-criterion_group!(goldilocks_arithmetic, bench_field, bench_packedfield);
-criterion_main!(goldilocks_arithmetic);
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs
deleted file mode 100644
index f4bf7e750..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs
+++ /dev/null
@@ -1,40 +0,0 @@
-use criterion::{Criterion, criterion_group, criterion_main};
-use p3_field::extension::BinomialExtensionField;
-use p3_field_testing::bench_func::{
-    benchmark_inv, benchmark_mul_latency, benchmark_mul_throughput, benchmark_square,
-};
-use p3_field_testing::benchmark_mul;
-use p3_goldilocks::Goldilocks;
-
-type EF2 = BinomialExtensionField<Goldilocks, 2>;
-type EF5 = BinomialExtensionField<Goldilocks, 5>;
-
-// Note that each round of throughput has 10 operations
-// So we should have 10 * more repetitions for latency tests.
-const REPS: usize = 50;
-const L_REPS: usize = 10 * REPS;
-
-fn bench_quadratic_extension(c: &mut Criterion) {
-    let name = "BinomialExtensionField<Goldilocks, 2>";
-    benchmark_square::<EF2>(c, name);
-    benchmark_inv::<EF2>(c, name);
-    benchmark_mul::<EF2>(c, name);
-    benchmark_mul_throughput::<EF2, REPS>(c, name);
-    benchmark_mul_latency::<EF2, L_REPS>(c, name);
-}
-
-fn bench_quintic_extension(c: &mut Criterion) {
-    let name = "BinomialExtensionField<Goldilocks, 5>";
-    benchmark_square::<EF5>(c, name);
-    benchmark_inv::<EF5>(c, name);
-    benchmark_mul::<EF5>(c, name);
-    benchmark_mul_throughput::<EF5, REPS>(c, name);
-    benchmark_mul_latency::<EF5, L_REPS>(c, name);
-}
-
-criterion_group!(
-    bench_goldilocks_ef,
-    bench_quadratic_extension,
-    bench_quintic_extension
-);
-criterion_main!(bench_goldilocks_ef);
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs
deleted file mode 100644
index 9d4b410d3..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs
+++ /dev/null
@@ -1,343 +0,0 @@
-//! MDS permutation for Goldilocks on aarch64.
-
-use core::arch::aarch64::*;
-use core::mem::transmute;
-
-use p3_mds::MdsPermutation;
-use p3_symmetric::Permutation;
-
-use super::packing::PackedGoldilocksNeon;
-use super::utils::{pack_lanes, unpack_lanes};
-use crate::{Goldilocks, MdsMatrixGoldilocks};
-
-// ---------------------------------------------------------------------------
-// Packed MdsMatrixGoldilocks (delegates to scalar Karatsuba per lane)
-// ---------------------------------------------------------------------------
-
-/// Apply the scalar MDS to each lane of a packed NEON state independently.
-#[inline]
-fn mds_packed<const WIDTH: usize>(
-    mds: &MdsMatrixGoldilocks,
-    input: &mut [PackedGoldilocksNeon; WIDTH],
-) where
-    MdsMatrixGoldilocks: Permutation<[Goldilocks; WIDTH]>,
-{
-    let (mut lane0, mut lane1) = unpack_lanes(input);
-    unsafe {
-        mds.permute_mut(&mut *(&mut lane0 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH]));
-        mds.permute_mut(&mut *(&mut lane1 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH]));
-    }
-    pack_lanes(input, &lane0, &lane1);
-}
-
-impl Permutation<[PackedGoldilocksNeon; 8]> for MdsMatrixGoldilocks {
-    fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 8]) {
-        mds_packed(self, input);
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksNeon, 8> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksNeon; 12]> for MdsMatrixGoldilocks {
-    fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 12]) {
-        mds_packed(self, input);
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksNeon, 12> for MdsMatrixGoldilocks {}
-
-// ---------------------------------------------------------------------------
-// NEON-accelerated circulant MDS (16-bit chunk multiply-accumulate)
-// ---------------------------------------------------------------------------
-
-/// Goldilocks identity: `2^64 ≡ 2^32 − 1 (mod P)`.
-const EPSILON_U32: u32 = 0xffffffff;
-
-/// Reduce two accumulated 4×32-bit chunk vectors back to Goldilocks field
-/// elements. Each `uint32x4_t` holds four 32-bit accumulators representing
-/// the four 16-bit chunks of a Goldilocks element:
-///
-/// ```text
-///     elem = c[0] + c[1]·2¹⁶ + c[2]·2³² + c[3]·2⁴⁸
-/// ```
-///
-/// Returns two Goldilocks values packed in a `uint64x2_t`.
-///
-/// Ported from plonky2.
-#[inline(always)]
-unsafe fn mds_reduce([cumul_a, cumul_b]: [uint32x4_t; 2]) -> uint64x2_t {
-    unsafe {
-        let mut lo = vreinterpretq_u64_u32(vuzp1q_u32(cumul_a, cumul_b));
-        let mut hi = vreinterpretq_u64_u32(vuzp2q_u32(cumul_a, cumul_b));
-
-        hi = vsraq_n_u64::<16>(hi, lo);
-        lo = vsliq_n_u64::<16>(lo, hi);
-
-        let top = {
-            let hi_u8 = vreinterpretq_u8_u64(hi);
-            let top_idx =
-                transmute::<[u8; 8], uint8x8_t>([0x06, 0x07, 0xff, 0xff, 0x0e, 0x0f, 0xff, 0xff]);
-            let top_u8 = vqtbl1_u8(hi_u8, top_idx);
-            vreinterpret_u32_u8(top_u8)
-        };
-
-        let adj_lo = vmlal_n_u32(lo, top, EPSILON_U32);
-        let wraparound_mask = vcgtq_u64(lo, adj_lo);
-        vsraq_n_u64::<32>(adj_lo, wraparound_mask)
-    }
-}
-
-/// NEON-accelerated width-8 circulant MDS.
-///
-/// Circulant first row: `[7, 1, 3, 8, 8, 3, 4, 9]`
-/// (matches `MATRIX_CIRC_MDS_8_SML_ROW`).
-#[inline(always)]
-pub unsafe fn mds_neon_w8(state: &[u64; 8]) -> [u64; 8] {
-    unsafe {
-        const ROW: [u32; 8] = [7, 1, 3, 8, 8, 3, 4, 9];
-
-        const M: [[u32; 8]; 8] = {
-            let mut m = [[0u32; 8]; 8];
-            let mut i = 0;
-            while i < 8 {
-                let mut j = 0;
-                while j < 8 {
-                    m[i][j] = ROW[(j + 8 - i) % 8];
-                    j += 1;
-                }
-                i += 1;
-            }
-            m
-        };
-
-        let c: [uint32x4_t; 8] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i])));
-
-        let mut res = [0u64; 8];
-
-        let mut pair = 0;
-        while pair < 4 {
-            let i0 = 2 * pair;
-            let i1 = i0 + 1;
-
-            let mut a0 = vdupq_n_u32(0);
-            let mut a1 = vdupq_n_u32(0);
-
-            let mut j = 0;
-            while j < 8 {
-                a0 = vmlaq_n_u32(a0, c[j], M[i0][j]);
-                a1 = vmlaq_n_u32(a1, c[j], M[i1][j]);
-                j += 1;
-            }
-
-            let r = mds_reduce([a0, a1]);
-            res[i0] = vgetq_lane_u64::<0>(r);
-            res[i1] = vgetq_lane_u64::<1>(r);
-            pair += 1;
-        }
-
-        res
-    }
-}
-
-/// NEON-accelerated width-12 circulant MDS.
-///
-/// Circulant first row: `[1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]`
-/// (matches `MATRIX_CIRC_MDS_12_SML_ROW`).
-#[inline(always)]
-pub unsafe fn mds_neon_w12(state: &[u64; 12]) -> [u64; 12] {
-    unsafe {
-        const ROW: [u32; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10];
-
-        const M: [[u32; 12]; 12] = {
-            let mut m = [[0u32; 12]; 12];
-            let mut i = 0;
-            while i < 12 {
-                let mut j = 0;
-                while j < 12 {
-                    m[i][j] = ROW[(j + 12 - i) % 12];
-                    j += 1;
-                }
-                i += 1;
-            }
-            m
-        };
-
-        let c: [uint32x4_t; 12] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i])));
-
-        let mut res = [0u64; 12];
-
-        let mut pair = 0;
-        while pair < 6 {
-            let i0 = 2 * pair;
-            let i1 = i0 + 1;
-
-            let mut a0 = vdupq_n_u32(0);
-            let mut a1 = vdupq_n_u32(0);
-
-            let mut j = 0;
-            while j < 12 {
-                a0 = vmlaq_n_u32(a0, c[j], M[i0][j]);
-                a1 = vmlaq_n_u32(a1, c[j], M[i1][j]);
-                j += 1;
-            }
-
-            let r = mds_reduce([a0, a1]);
-            res[i0] = vgetq_lane_u64::<0>(r);
-            res[i1] = vgetq_lane_u64::<1>(r);
-            pair += 1;
-        }
-
-        res
-    }
-}
-
-/// NEON-accelerated MDS wrapper for use with the generic Poseidon1.
-///
-/// Zero-sized type that implements `Permutation<[Goldilocks; 8]>` and
-/// `Permutation<[Goldilocks; 12]>` using the NEON chunk technique. Plugs
-/// into `Poseidon1ExternalLayerGeneric` to accelerate full-round MDS while
-/// keeping LLVM-optimized partial rounds from the generic Poseidon1.
-#[derive(Clone, Debug, Default)]
-pub struct MdsNeonGoldilocks;
-
-impl Permutation<[Goldilocks; 8]> for MdsNeonGoldilocks {
-    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
-        let raw = unsafe { &*(state as *const [Goldilocks; 8] as *const [u64; 8]) };
-        let result = unsafe { mds_neon_w8(raw) };
-        *unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) } = result;
-    }
-}
-
-impl Permutation<[Goldilocks; 12]> for MdsNeonGoldilocks {
-    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
-        let raw = unsafe { &*(state as *const [Goldilocks; 12] as *const [u64; 12]) };
-        let result = unsafe { mds_neon_w12(raw) };
-        *unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) } = result;
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::PrimeField64;
-    use p3_symmetric::Permutation;
-    use rand::rngs::SmallRng;
-    use rand::{RngExt, SeedableRng};
-
-    use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksNeon};
-
-    type F = Goldilocks;
-
-    // -- Packed MdsMatrixGoldilocks tests --
-
-    macro_rules! test_neon_mds {
-        ($name:ident, $width:literal) => {
-            #[test]
-            fn $name() {
-                let mut rng = SmallRng::seed_from_u64(1);
-                let mds = MdsMatrixGoldilocks;
-
-                let input: [Goldilocks; $width] = rng.random();
-                let expected = mds.permute(input);
-
-                let packed_input = input.map(Into::<PackedGoldilocksNeon>::into);
-                let packed_output = mds.permute(packed_input);
-
-                let neon_output = packed_output.map(|x| x.0[0]);
-                assert_eq!(neon_output, expected);
-            }
-        };
-    }
-
-    test_neon_mds!(test_neon_mds_width_8, 8);
-    test_neon_mds!(test_neon_mds_width_12, 12);
-
-    // -- NEON MDS correctness tests --
-
-    #[test]
-    fn test_mds_neon_w8_matches_karatsuba() {
-        let mds = MdsMatrixGoldilocks;
-        let mut rng = SmallRng::seed_from_u64(42);
-
-        for _ in 0..100 {
-            let input: [F; 8] = rng.random();
-            let expected = mds.permute(input);
-
-            let raw: [u64; 8] = input.map(|x| x.as_canonical_u64());
-            let result = unsafe { super::mds_neon_w8(&raw) };
-
-            for i in 0..8 {
-                assert_eq!(
-                    F::new(result[i]).as_canonical_u64(),
-                    expected[i].as_canonical_u64(),
-                    "NEON MDS w8 mismatch at index {i}"
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn test_mds_neon_w12_matches_karatsuba() {
-        let mds = MdsMatrixGoldilocks;
-        let mut rng = SmallRng::seed_from_u64(43);
-
-        for _ in 0..100 {
-            let input: [F; 12] = rng.random();
-            let expected = mds.permute(input);
-
-            let raw: [u64; 12] = input.map(|x| x.as_canonical_u64());
-            let result = unsafe { super::mds_neon_w12(&raw) };
-
-            for i in 0..12 {
-                assert_eq!(
-                    F::new(result[i]).as_canonical_u64(),
-                    expected[i].as_canonical_u64(),
-                    "NEON MDS w12 mismatch at index {i}"
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn test_mds_neon_boundary_w8() {
-        let mds = MdsMatrixGoldilocks;
-        let p_minus_1 = F::ORDER_U64 - 1;
-
-        for &val in &[0u64, 1, p_minus_1] {
-            let input: [F; 8] = [F::new(val); 8];
-            let expected = mds.permute(input);
-
-            let raw = [val; 8];
-            let result = unsafe { super::mds_neon_w8(&raw) };
-
-            for i in 0..8 {
-                assert_eq!(
-                    F::new(result[i]).as_canonical_u64(),
-                    expected[i].as_canonical_u64(),
-                    "NEON MDS w8 boundary mismatch at index {i} for value {val}"
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn test_mds_neon_boundary_w12() {
-        let mds = MdsMatrixGoldilocks;
-        let p_minus_1 = F::ORDER_U64 - 1;
-
-        for &val in &[0u64, 1, p_minus_1] {
-            let input: [F; 12] = [F::new(val); 12];
-            let expected = mds.permute(input);
-
-            let raw = [val; 12];
-            let result = unsafe { super::mds_neon_w12(&raw) };
-
-            for i in 0..12 {
-                assert_eq!(
-                    F::new(result[i]).as_canonical_u64(),
-                    expected[i].as_canonical_u64(),
-                    "NEON MDS w12 boundary mismatch at index {i} for value {val}"
-                );
-            }
-        }
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs
deleted file mode 100644
index 82516a6cf..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-mod mds;
-mod packing;
-mod poseidon1;
-mod poseidon1_asm;
-mod poseidon2;
-mod poseidon2_asm;
-mod utils;
-
-pub use mds::MdsNeonGoldilocks;
-pub use packing::*;
-pub use poseidon1::*;
-pub use poseidon2::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs
deleted file mode 100644
index f393c3b65..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs
+++ /dev/null
@@ -1,404 +0,0 @@
-use alloc::vec::Vec;
-use core::arch::aarch64::{
-    uint64x2_t, vaddq_u64, vandq_u64, vbicq_u64, vcgtq_s64, vdupq_n_u64, veorq_u64, vgetq_lane_u64,
-    vreinterpretq_s64_u64, vsetq_lane_u64, vshrq_n_u64, vsubq_u64,
-};
-use core::fmt::Debug;
-use core::iter::{Product, Sum};
-use core::mem::transmute;
-use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
-
-use p3_field::exponentiation::exp_10540996611094048183;
-use p3_field::op_assign_macros::{
-    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
-    impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
-    ring_sum,
-};
-use p3_field::{
-    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
-    PermutationMonomial, PrimeCharacteristicRing, PrimeField64,
-};
-use p3_util::reconstitute_from_base;
-use rand::distr::{Distribution, StandardUniform};
-use rand::{Rng, RngExt};
-
-use crate::{Goldilocks, P};
-
-const WIDTH: usize = 2;
-
-/// Equal to `2^32 - 1 = 2^64 mod P`.
-const EPSILON: u64 = Goldilocks::ORDER_U64.wrapping_neg();
-
-/// Vectorized NEON implementation of `Goldilocks` arithmetic.
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
-#[repr(transparent)]
-#[must_use]
-pub struct PackedGoldilocksNeon(pub [Goldilocks; WIDTH]);
-
-impl PackedGoldilocksNeon {
-    #[inline]
-    #[must_use]
-    pub(crate) fn to_vector(self) -> uint64x2_t {
-        unsafe { transmute(self) }
-    }
-
-    #[inline]
-    pub(crate) fn from_vector(vector: uint64x2_t) -> Self {
-        unsafe { transmute(vector) }
-    }
-
-    #[inline]
-    const fn broadcast(value: Goldilocks) -> Self {
-        Self([value; WIDTH])
-    }
-}
-
-impl From<Goldilocks> for PackedGoldilocksNeon {
-    fn from(x: Goldilocks) -> Self {
-        Self::broadcast(x)
-    }
-}
-
-impl Add for PackedGoldilocksNeon {
-    type Output = Self;
-    #[inline]
-    fn add(self, rhs: Self) -> Self {
-        Self::from_vector(add(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl Sub for PackedGoldilocksNeon {
-    type Output = Self;
-    #[inline]
-    fn sub(self, rhs: Self) -> Self {
-        Self::from_vector(sub(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl Neg for PackedGoldilocksNeon {
-    type Output = Self;
-    #[inline]
-    fn neg(self) -> Self {
-        Self::from_vector(neg(self.to_vector()))
-    }
-}
-
-impl Mul for PackedGoldilocksNeon {
-    type Output = Self;
-    #[inline]
-    fn mul(self, rhs: Self) -> Self {
-        Self::from_vector(mul(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl_add_assign!(PackedGoldilocksNeon);
-impl_sub_assign!(PackedGoldilocksNeon);
-impl_mul_methods!(PackedGoldilocksNeon);
-ring_sum!(PackedGoldilocksNeon);
-impl_rng!(PackedGoldilocksNeon);
-
-impl PrimeCharacteristicRing for PackedGoldilocksNeon {
-    type PrimeSubfield = Goldilocks;
-
-    const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
-    const ONE: Self = Self::broadcast(Goldilocks::ONE);
-    const TWO: Self = Self::broadcast(Goldilocks::TWO);
-    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
-
-    #[inline]
-    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
-        f.into()
-    }
-
-    #[inline]
-    fn halve(&self) -> Self {
-        Self::from_vector(halve(self.to_vector()))
-    }
-
-    #[inline]
-    fn square(&self) -> Self {
-        Self::from_vector(square(self.to_vector()))
-    }
-
-    #[inline]
-    fn zero_vec(len: usize) -> Vec<Self> {
-        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
-    }
-}
-
-impl InjectiveMonomial<7> for PackedGoldilocksNeon {}
-
-impl PermutationMonomial<7> for PackedGoldilocksNeon {
-    fn injective_exp_root_n(&self) -> Self {
-        exp_10540996611094048183(*self)
-    }
-}
-
-impl_add_base_field!(PackedGoldilocksNeon, Goldilocks);
-impl_sub_base_field!(PackedGoldilocksNeon, Goldilocks);
-impl_mul_base_field!(PackedGoldilocksNeon, Goldilocks);
-impl_div_methods!(PackedGoldilocksNeon, Goldilocks);
-impl_sum_prod_base_field!(PackedGoldilocksNeon, Goldilocks);
-
-impl Algebra<Goldilocks> for PackedGoldilocksNeon {
-    // Benchmarked on AArch64 NEON: chunk=2 ≈ 182ns, chunk=4 ≈ 198ns, chunk=8 ≈ 221ns.
-    const BATCHED_LC_CHUNK: usize = 2;
-}
-
-impl_packed_value!(PackedGoldilocksNeon, Goldilocks, WIDTH);
-
-unsafe impl PackedField for PackedGoldilocksNeon {
-    type Scalar = Goldilocks;
-}
-
-/// Interleave two 64-bit vectors at the element level.
-/// For block_len=1: [a0, a1] x [b0, b1] -> [a0, b0], [a1, b1]
-#[inline]
-pub fn interleave_u64(v0: uint64x2_t, v1: uint64x2_t) -> (uint64x2_t, uint64x2_t) {
-    unsafe {
-        let a0 = vgetq_lane_u64::<0>(v0);
-        let a1 = vgetq_lane_u64::<1>(v0);
-        let b0 = vgetq_lane_u64::<0>(v1);
-        let b1 = vgetq_lane_u64::<1>(v1);
-
-        // r0 = [a0, b0], r1 = [a1, b1]
-        let r0 = vsetq_lane_u64::<1>(b0, vsetq_lane_u64::<0>(a0, vdupq_n_u64(0)));
-        let r1 = vsetq_lane_u64::<1>(b1, vsetq_lane_u64::<0>(a1, vdupq_n_u64(0)));
-
-        (r0, r1)
-    }
-}
-
-unsafe impl PackedFieldPow2 for PackedGoldilocksNeon {
-    fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
-        let (v0, v1) = (self.to_vector(), other.to_vector());
-        let (res0, res1) = match block_len {
-            1 => interleave_u64(v0, v1),
-            2 => (v0, v1),
-            _ => panic!("unsupported block length"),
-        };
-        (Self::from_vector(res0), Self::from_vector(res1))
-    }
-}
-
-// NEON arithmetic uses shifted representation (XOR with 2^63) for unsigned comparison.
-
-const SIGN_BIT: uint64x2_t = unsafe { transmute([i64::MIN as u64; WIDTH]) };
-const SHIFTED_FIELD_ORDER: uint64x2_t =
-    unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) };
-const EPSILON_VEC: uint64x2_t = unsafe { transmute([EPSILON; WIDTH]) };
-
-#[inline(always)]
-fn shift(x: uint64x2_t) -> uint64x2_t {
-    unsafe { veorq_u64(x, SIGN_BIT) }
-}
-
-#[inline(always)]
-unsafe fn canonicalize_s(x_s: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let x_s_signed = vreinterpretq_s64_u64(x_s);
-        let order_s_signed = vreinterpretq_s64_u64(SHIFTED_FIELD_ORDER);
-        let mask = vcgtq_s64(order_s_signed, x_s_signed);
-        let wrapback_amt = vbicq_u64(EPSILON_VEC, mask);
-        vaddq_u64(x_s, wrapback_amt)
-    }
-}
-
-#[inline(always)]
-unsafe fn add_no_double_overflow_64_64s_s(x: uint64x2_t, y_s: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let res_wrapped_s = vaddq_u64(x, y_s);
-        // After XOR shift, signed comparison correctly detects overflow.
-        // Overflow occurred iff y_s > res_wrapped_s (as signed, due to shift semantics)
-        let y_s_signed = vreinterpretq_s64_u64(y_s);
-        let res_s_signed = vreinterpretq_s64_u64(res_wrapped_s);
-        let mask = vcgtq_s64(y_s_signed, res_s_signed);
-        // wrapback_amt is EPSILON on overflow
-        let wrapback_amt = vshrq_n_u64::<32>(mask);
-        vaddq_u64(res_wrapped_s, wrapback_amt)
-    }
-}
-
-/// Goldilocks modular addition.
-#[inline]
-fn add(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let y_s = shift(y);
-        let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s));
-        shift(res_s)
-    }
-}
-
-/// Goldilocks modular subtraction.
-#[inline]
-fn sub(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let mut y_s = shift(y);
-        y_s = canonicalize_s(y_s);
-        let x_s = shift(x);
-        let y_s_signed = vreinterpretq_s64_u64(y_s);
-        let x_s_signed = vreinterpretq_s64_u64(x_s);
-        // -1 if underflow (y > x)
-        let mask = vcgtq_s64(y_s_signed, x_s_signed);
-        let wrapback_amt = vshrq_n_u64::<32>(mask);
-        let res_wrapped = vsubq_u64(x_s, y_s);
-        vsubq_u64(res_wrapped, wrapback_amt)
-    }
-}
-
-/// Goldilocks modular negation.
-#[inline]
-fn neg(y: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let y_s = shift(y);
-        vsubq_u64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s))
-    }
-}
-
-/// Halve a vector of Goldilocks field elements.
-#[inline(always)]
-pub(crate) fn halve(input: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let one = vdupq_n_u64(1);
-        let zero = vdupq_n_u64(0);
-        let half = vdupq_n_u64(P.div_ceil(2));
-
-        let least_bit = vandq_u64(input, one);
-        let t = vshrq_n_u64::<1>(input);
-        // neg_least_bit is 0 or -1 (all bits 1)
-        let neg_least_bit = vsubq_u64(zero, least_bit);
-        let maybe_half = vandq_u64(half, neg_least_bit);
-        vaddq_u64(t, maybe_half)
-    }
-}
-
-/// Goldilocks modular multiplication using interleaved dual-lane ASM.
-#[inline]
-fn mul(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let x0 = vgetq_lane_u64::<0>(x);
-        let x1 = vgetq_lane_u64::<1>(x);
-        let y0 = vgetq_lane_u64::<0>(y);
-        let y1 = vgetq_lane_u64::<1>(y);
-
-        let (res_0, res_1) = mul_reduce_dual_asm(x0, y0, x1, y1);
-
-        transmute([res_0, res_1])
-    }
-}
-
-/// Interleaved dual-lane multiplication and reduction using scalar ASM.
-/// Uses shift-based EPSILON multiplication: hi_lo * EPSILON = (hi_lo << 32) - hi_lo
-#[inline(always)]
-unsafe fn mul_reduce_dual_asm(a0: u64, b0: u64, a1: u64, b1: u64) -> (u64, u64) {
-    use core::arch::asm;
-    let result0: u64;
-    let result1: u64;
-
-    unsafe {
-        asm!(
-            // Compute both 128-bit products (interleaved for ILP)
-            "mul   {lo0}, {a0}, {b0}",
-            "mul   {lo1}, {a1}, {b1}",
-            "umulh {hi0}, {a0}, {b0}",
-            "umulh {hi1}, {a1}, {b1}",
-
-            // hi_hi = hi >> 32
-            "lsr   {hi_hi0}, {hi0}, #32",
-            "lsr   {hi_hi1}, {hi1}, #32",
-
-            // tmp = lo - hi_hi (with borrow handling)
-            "subs  {tmp0}, {lo0}, {hi_hi0}",
-            "csetm {adj0:w}, cc",
-            "subs  {tmp1}, {lo1}, {hi_hi1}",
-            "csetm {adj1:w}, cc",
-            "sub   {tmp0}, {tmp0}, {adj0}",
-            "sub   {tmp1}, {tmp1}, {adj1}",
-
-            // hi_lo = hi & EPSILON
-            "and   {hi_lo0}, {hi0}, {epsilon}",
-            "and   {hi_lo1}, {hi1}, {epsilon}",
-
-            // hi_lo_eps = (hi_lo << 32) - hi_lo (avoids multiply)
-            "lsl   {t0}, {hi_lo0}, #32",
-            "lsl   {t1}, {hi_lo1}, #32",
-            "sub   {hi_lo_eps0}, {t0}, {hi_lo0}",
-            "sub   {hi_lo_eps1}, {t1}, {hi_lo1}",
-
-            // result = tmp + hi_lo_eps (with overflow handling)
-            "adds  {result0}, {tmp0}, {hi_lo_eps0}",
-            "csetm {adj0:w}, cs",
-            "adds  {result1}, {tmp1}, {hi_lo_eps1}",
-            "csetm {adj1:w}, cs",
-            "add   {result0}, {result0}, {adj0}",
-            "add   {result1}, {result1}, {adj1}",
-
-            a0 = in(reg) a0,
-            b0 = in(reg) b0,
-            a1 = in(reg) a1,
-            b1 = in(reg) b1,
-            epsilon = in(reg) EPSILON,
-            lo0 = out(reg) _,
-            lo1 = out(reg) _,
-            hi0 = out(reg) _,
-            hi1 = out(reg) _,
-            hi_hi0 = out(reg) _,
-            hi_hi1 = out(reg) _,
-            tmp0 = out(reg) _,
-            tmp1 = out(reg) _,
-            hi_lo0 = out(reg) _,
-            hi_lo1 = out(reg) _,
-            t0 = out(reg) _,
-            t1 = out(reg) _,
-            hi_lo_eps0 = out(reg) _,
-            hi_lo_eps1 = out(reg) _,
-            adj0 = out(reg) _,
-            adj1 = out(reg) _,
-            result0 = out(reg) result0,
-            result1 = out(reg) result1,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    (result0, result1)
-}
-
-/// Goldilocks modular square using interleaved dual-lane ASM.
-#[inline]
-fn square(x: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let x0 = vgetq_lane_u64::<0>(x);
-        let x1 = vgetq_lane_u64::<1>(x);
-
-        let (res_0, res_1) = mul_reduce_dual_asm(x0, x0, x1, x1);
-
-        transmute([res_0, res_1])
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field_testing::test_packed_field;
-
-    use super::{Goldilocks, PackedGoldilocksNeon, WIDTH};
-
-    const SPECIAL_VALS: [Goldilocks; WIDTH] =
-        Goldilocks::new_array([0xFFFF_FFFF_0000_0000, 0xFFFF_FFFF_FFFF_FFFF]);
-
-    const ZEROS: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001, // = P, canonicalizes to 0
-    ]));
-
-    const ONES: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002, // = P + 1, canonicalizes to 1
-    ]));
-
-    test_packed_field!(
-        crate::PackedGoldilocksNeon,
-        &[super::ZEROS],
-        &[super::ONES],
-        crate::PackedGoldilocksNeon(super::SPECIAL_VALS)
-    );
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs
deleted file mode 100644
index 0a877578a..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs
+++ /dev/null
@@ -1,716 +0,0 @@
-//! Fused Poseidon1 permutation for Goldilocks on aarch64.
-
-use alloc::vec::Vec;
-
-use p3_poseidon1::{
-    FullRoundConstants, PartialRoundConstants, full_round_initial_permute_state,
-    full_round_terminal_permute_state, partial_permute_state,
-};
-use p3_symmetric::{CryptographicPermutation, Permutation};
-
-use super::mds::{MdsNeonGoldilocks, mds_neon_w8, mds_neon_w12};
-use super::packing::PackedGoldilocksNeon;
-use super::poseidon1_asm::*;
-use super::poseidon2_asm::{sbox_layer_asm, sbox_layer_dual_asm};
-use super::utils::{pack_lanes, unpack_lanes};
-use crate::Goldilocks;
-
-/// Fused Poseidon1 permutation for Goldilocks.
-///
-/// Holds the pre-extracted raw `u64` constants from the optimized Poseidon1
-/// sparse-matrix decomposition. Storing raw values avoids field-element
-/// overhead in the hot inner loop.
-#[derive(Clone, Debug)]
-pub struct Poseidon1GoldilocksFused<const WIDTH: usize> {
-    /// Round constants for the initial full rounds (RF/2 vectors).
-    initial_constants_raw: Vec<[u64; WIDTH]>,
-    /// Round constants for the terminal full rounds (RF/2 vectors).
-    terminal_constants_raw: Vec<[u64; WIDTH]>,
-    /// Full-width constant vector for the first partial round.
-    first_round_constants_raw: [u64; WIDTH],
-    /// Dense transition matrix applied once before entering the partial-round loop.
-    m_i_raw: [[u64; WIDTH]; WIDTH],
-    /// Per-round first row of the sparse matrix (one per partial round).
-    sparse_first_row_raw: Vec<[u64; WIDTH]>,
-    /// Per-round sub-diagonal vector for the sparse matmul (one per partial round).
-    v_raw: Vec<[u64; WIDTH]>,
-    /// Scalar round constants for partial rounds 0 through RP-2.
-    ///
-    /// The last partial round has no scalar constant (it ends with the S-box only).
-    round_constants_raw: Vec<u64>,
-}
-
-impl<const WIDTH: usize> Poseidon1GoldilocksFused<WIDTH> {
-    /// Create from pre-computed full and partial round constants.
-    ///
-    /// Extracts the raw `u64` representation from each Goldilocks field
-    /// element, building the flat arrays that the ASM kernels consume.
-    pub fn new(
-        full: &FullRoundConstants<Goldilocks, WIDTH>,
-        partial: &PartialRoundConstants<Goldilocks, WIDTH>,
-    ) -> Self {
-        // Extract raw u64 values from full-round constant matrices.
-        let initial_constants_raw = full
-            .initial
-            .iter()
-            .map(|rc| core::array::from_fn(|i| rc[i].value))
-            .collect();
-        let terminal_constants_raw = full
-            .terminal
-            .iter()
-            .map(|rc| core::array::from_fn(|i| rc[i].value))
-            .collect();
-
-        // Extract the first partial-round constant vector.
-        let first_round_constants_raw =
-            core::array::from_fn(|i| partial.first_round_constants[i].value);
-
-        // Extract the dense transition matrix.
-        let m_i_raw = core::array::from_fn(|i| core::array::from_fn(|j| partial.m_i[i][j].value));
-
-        // Extract per-round sparse matrix data.
-        let sparse_first_row_raw = partial
-            .sparse_first_row
-            .iter()
-            .map(|r| core::array::from_fn(|i| r[i].value))
-            .collect();
-        let v_raw = partial
-            .v
-            .iter()
-            .map(|r| core::array::from_fn(|i| r[i].value))
-            .collect();
-
-        // Extract scalar round constants for partial rounds.
-        let round_constants_raw = partial.round_constants.iter().map(|c| c.value).collect();
-
-        Self {
-            initial_constants_raw,
-            terminal_constants_raw,
-            first_round_constants_raw,
-            m_i_raw,
-            sparse_first_row_raw,
-            v_raw,
-            round_constants_raw,
-        }
-    }
-}
-
-/// Run the initial or terminal full rounds on a raw width-8 state.
-///
-/// Each full round applies: add constants, S-box on all elements, NEON MDS.
-#[inline]
-fn full_rounds_scalar_w8(raw: &mut [u64; 8], constants: &[[u64; 8]]) {
-    for rc in constants {
-        unsafe {
-            add_rc_asm(raw, rc);
-            sbox_layer_asm(raw);
-        }
-        *raw = unsafe { mds_neon_w8(raw) };
-    }
-}
-
-/// Run the initial or terminal full rounds on a raw width-12 state.
-///
-/// Each full round applies: add constants, S-box on all elements, NEON MDS.
-#[inline]
-fn full_rounds_scalar_w12(raw: &mut [u64; 12], constants: &[[u64; 12]]) {
-    for rc in constants {
-        unsafe {
-            add_rc_asm(raw, rc);
-            sbox_layer_asm(raw);
-        }
-        *raw = unsafe { mds_neon_w12(raw) };
-    }
-}
-
-/// Run all partial rounds on a raw width-8 state.
-///
-/// The partial-round sequence is:
-/// 1. Add the first-round full-width constant vector.
-/// 2. Apply the dense transition matrix once.
-/// 3. For each partial round (except the last):
-///    S-box on first element, add scalar constant, sparse matmul.
-/// 4. Last partial round: S-box on first element, sparse matmul (no constant).
-#[inline]
-fn partial_rounds_scalar_w8(
-    raw: &mut [u64; 8],
-    first_rc: &[u64; 8],
-    m_i: &[[u64; 8]; 8],
-    sparse_first_row: &[[u64; 8]],
-    v: &[[u64; 8]],
-    round_constants: &[u64],
-) {
-    // Add the first-round full-width constant vector.
-    unsafe {
-        add_rc_asm(raw, first_rc);
-    }
-
-    // Apply the dense transition matrix once.
-    dense_matmul_asm_w8(raw, m_i);
-
-    // Main partial-round loop: S-box + scalar constant + sparse matmul.
-    let rounds_p = sparse_first_row.len();
-    for r in 0..rounds_p - 1 {
-        unsafe {
-            sbox_s0_asm(raw);
-            add_scalar_s0_asm(raw, round_constants[r]);
-            cheap_matmul_asm_w8(raw, &sparse_first_row[r], &v[r]);
-        }
-    }
-
-    // Last partial round: no scalar constant.
-    unsafe {
-        sbox_s0_asm(raw);
-        cheap_matmul_asm_w8(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]);
-    }
-}
-
-/// Run all partial rounds on a raw width-12 state.
-///
-/// Same structure as the width-8 variant.
-#[inline]
-fn partial_rounds_scalar_w12(
-    raw: &mut [u64; 12],
-    first_rc: &[u64; 12],
-    m_i: &[[u64; 12]; 12],
-    sparse_first_row: &[[u64; 12]],
-    v: &[[u64; 12]],
-    round_constants: &[u64],
-) {
-    unsafe {
-        add_rc_asm(raw, first_rc);
-    }
-    dense_matmul_asm_w12(raw, m_i);
-
-    let rounds_p = sparse_first_row.len();
-    for r in 0..rounds_p - 1 {
-        unsafe {
-            sbox_s0_asm(raw);
-            add_scalar_s0_asm(raw, round_constants[r]);
-            cheap_matmul_asm_w12(raw, &sparse_first_row[r], &v[r]);
-        }
-    }
-    unsafe {
-        sbox_s0_asm(raw);
-        cheap_matmul_asm_w12(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]);
-    }
-}
-
-/// Run the initial or terminal full rounds on two raw width-8 lanes.
-///
-/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane.
-#[inline]
-fn full_rounds_dual_w8(lane0: &mut [u64; 8], lane1: &mut [u64; 8], constants: &[[u64; 8]]) {
-    for rc in constants {
-        unsafe {
-            add_rc_dual_asm(lane0, lane1, rc);
-            sbox_layer_dual_asm(lane0, lane1);
-        }
-        *lane0 = unsafe { mds_neon_w8(lane0) };
-        *lane1 = unsafe { mds_neon_w8(lane1) };
-    }
-}
-
-/// Run the initial or terminal full rounds on two raw width-12 lanes.
-///
-/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane.
-#[inline]
-fn full_rounds_dual_w12(lane0: &mut [u64; 12], lane1: &mut [u64; 12], constants: &[[u64; 12]]) {
-    for rc in constants {
-        unsafe {
-            add_rc_dual_asm(lane0, lane1, rc);
-            sbox_layer_dual_asm(lane0, lane1);
-        }
-        *lane0 = unsafe { mds_neon_w12(lane0) };
-        *lane1 = unsafe { mds_neon_w12(lane1) };
-    }
-}
-
-/// Run all partial rounds on two width-8 lanes simultaneously.
-///
-/// Uses dual-lane S-box and sparse matmul primitives to keep the
-/// pipeline full. The scalar constant is added to each lane separately
-/// (no dual variant needed for a single-element addition).
-#[inline]
-fn partial_rounds_dual_w8(
-    lane0: &mut [u64; 8],
-    lane1: &mut [u64; 8],
-    first_rc: &[u64; 8],
-    m_i: &[[u64; 8]; 8],
-    sparse_first_row: &[[u64; 8]],
-    v: &[[u64; 8]],
-    round_constants: &[u64],
-) {
-    // Add the first-round constant to both lanes.
-    unsafe {
-        add_rc_dual_asm(lane0, lane1, first_rc);
-    }
-
-    // Dense transition matrix on both lanes.
-    dense_matmul_dual_asm_w8(lane0, lane1, m_i);
-
-    // Main partial-round loop.
-    let rounds_p = sparse_first_row.len();
-    for r in 0..rounds_p - 1 {
-        unsafe {
-            sbox_s0_dual_asm(lane0, lane1);
-            add_scalar_s0_asm(lane0, round_constants[r]);
-            add_scalar_s0_asm(lane1, round_constants[r]);
-            cheap_matmul_dual_asm_w8(lane0, lane1, &sparse_first_row[r], &v[r]);
-        }
-    }
-
-    // Last partial round: no scalar constant.
-    unsafe {
-        sbox_s0_dual_asm(lane0, lane1);
-        cheap_matmul_dual_asm_w8(
-            lane0,
-            lane1,
-            &sparse_first_row[rounds_p - 1],
-            &v[rounds_p - 1],
-        );
-    }
-}
-
-/// Run all partial rounds on two width-12 lanes simultaneously.
-///
-/// Same structure as the width-8 dual variant.
-#[inline]
-fn partial_rounds_dual_w12(
-    lane0: &mut [u64; 12],
-    lane1: &mut [u64; 12],
-    first_rc: &[u64; 12],
-    m_i: &[[u64; 12]; 12],
-    sparse_first_row: &[[u64; 12]],
-    v: &[[u64; 12]],
-    round_constants: &[u64],
-) {
-    unsafe {
-        add_rc_dual_asm(lane0, lane1, first_rc);
-    }
-    dense_matmul_dual_asm_w12(lane0, lane1, m_i);
-
-    let rounds_p = sparse_first_row.len();
-    for r in 0..rounds_p - 1 {
-        unsafe {
-            sbox_s0_dual_asm(lane0, lane1);
-            add_scalar_s0_asm(lane0, round_constants[r]);
-            add_scalar_s0_asm(lane1, round_constants[r]);
-            cheap_matmul_dual_asm_w12(lane0, lane1, &sparse_first_row[r], &v[r]);
-        }
-    }
-    unsafe {
-        sbox_s0_dual_asm(lane0, lane1);
-        cheap_matmul_dual_asm_w12(
-            lane0,
-            lane1,
-            &sparse_first_row[rounds_p - 1],
-            &v[rounds_p - 1],
-        );
-    }
-}
-
-impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
-        // Zero-cost transmute: Goldilocks is repr(transparent) over u64.
-        let raw = unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) };
-
-        // Initial full rounds, then partial rounds, then terminal full rounds.
-        full_rounds_scalar_w8(raw, &self.initial_constants_raw);
-        partial_rounds_scalar_w8(
-            raw,
-            &self.first_round_constants_raw,
-            &self.m_i_raw,
-            &self.sparse_first_row_raw,
-            &self.v_raw,
-            &self.round_constants_raw,
-        );
-        full_rounds_scalar_w8(raw, &self.terminal_constants_raw);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> {}
-
-impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) {
-        // Unpack the two lanes from the packed representation.
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-
-        // Run the full permutation on both lanes simultaneously.
-        full_rounds_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        partial_rounds_dual_w8(
-            &mut lane0,
-            &mut lane1,
-            &self.first_round_constants_raw,
-            &self.m_i_raw,
-            &self.sparse_first_row_raw,
-            &self.v_raw,
-            &self.round_constants_raw,
-        );
-        full_rounds_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-
-        // Repack both lanes into the packed representation.
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> {}
-
-impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
-        let raw = unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) };
-
-        full_rounds_scalar_w12(raw, &self.initial_constants_raw);
-        partial_rounds_scalar_w12(
-            raw,
-            &self.first_round_constants_raw,
-            &self.m_i_raw,
-            &self.sparse_first_row_raw,
-            &self.v_raw,
-            &self.round_constants_raw,
-        );
-        full_rounds_scalar_w12(raw, &self.terminal_constants_raw);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> {}
-
-impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-
-        full_rounds_dual_w12(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        partial_rounds_dual_w12(
-            &mut lane0,
-            &mut lane1,
-            &self.first_round_constants_raw,
-            &self.m_i_raw,
-            &self.sparse_first_row_raw,
-            &self.v_raw,
-            &self.round_constants_raw,
-        );
-        full_rounds_dual_w12(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> {}
-
-/// Dual-dispatch wrapper for Goldilocks Poseidon1.
-///
-/// **Scalar** permutations use the NEON-accelerated MDS for full rounds
-/// and LLVM-optimized sparse matrix decomposition for partial rounds.
-/// This avoids sequential inline ASM that would prevent LLVM's
-/// instruction scheduling optimizations on wide out-of-order cores.
-///
-/// **Packed** permutations delegate to the fused dual-lane ASM path
-/// with NEON MDS for full rounds and sparse matrix for partial rounds
-/// (dual-lane interleaving hides multiply latency).
-#[derive(Clone, Debug)]
-pub struct Poseidon1GoldilocksDispatch<const WIDTH: usize> {
-    /// Fused dual-lane path — used for packed permutations.
-    fused: Poseidon1GoldilocksFused<WIDTH>,
-    /// Pre-computed full round constants for NEON MDS.
-    full_constants: FullRoundConstants<Goldilocks, WIDTH>,
-    /// Pre-computed partial round constants (textbook path for scalar, sparse for packed).
-    partial_constants: PartialRoundConstants<Goldilocks, WIDTH>,
-}
-
-impl<const WIDTH: usize> Poseidon1GoldilocksDispatch<WIDTH> {
-    /// Create from fused and pre-computed constants.
-    pub const fn new(
-        fused: Poseidon1GoldilocksFused<WIDTH>,
-        full_constants: FullRoundConstants<Goldilocks, WIDTH>,
-        partial_constants: PartialRoundConstants<Goldilocks, WIDTH>,
-    ) -> Self {
-        Self {
-            fused,
-            full_constants,
-            partial_constants,
-        }
-    }
-}
-
-// --- Width 8 ---
-
-impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
-        let mds = MdsNeonGoldilocks;
-        full_round_initial_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds);
-        partial_permute_state::<_, _, 8, 7>(state, &self.partial_constants);
-        full_round_terminal_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> {}
-
-impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) {
-        self.fused.permute_mut(state);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> {}
-
-// --- Width 12 ---
-
-impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
-        let mds = MdsNeonGoldilocks;
-        full_round_initial_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds);
-        partial_permute_state::<_, _, 12, 7>(state, &self.partial_constants);
-        full_round_terminal_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> {}
-
-impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) {
-        // Extract both lanes, run the optimized scalar path on each, repack.
-        // Directly inline the scalar logic (NEON MDS full rounds + sparse partial
-        // rounds) to avoid trait-dispatch overhead and enable cross-call inlining.
-        let mut lane0: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[0]);
-        let mut lane1: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[1]);
-
-        let mds = MdsNeonGoldilocks;
-        full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds);
-        partial_permute_state::<_, _, 12, 7>(&mut lane0, &self.partial_constants);
-        full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds);
-
-        full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds);
-        partial_permute_state::<_, _, 12, 7>(&mut lane1, &self.partial_constants);
-        full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds);
-
-        for i in 0..12 {
-            state[i] = PackedGoldilocksNeon([lane0[i], lane1[i]]);
-        }
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> {}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::{PrimeCharacteristicRing, PrimeField64};
-    use p3_poseidon1::Poseidon1Constants;
-    use p3_symmetric::Permutation;
-    use rand::rngs::SmallRng;
-    use rand::{RngExt, SeedableRng};
-
-    use super::*;
-    use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL};
-    use crate::poseidon1::{
-        GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-        GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, GOLDILOCKS_POSEIDON1_RC_8,
-        GOLDILOCKS_POSEIDON1_RC_12, default_goldilocks_poseidon1_8,
-        default_goldilocks_poseidon1_12,
-    };
-
-    type F = Goldilocks;
-
-    /// Build a width-8 fused permutation from the fixed round constants.
-    fn make_fused_w8() -> Poseidon1GoldilocksFused<8> {
-        let raw = Poseidon1Constants {
-            rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-            rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-            mds_circ_col: MATRIX_CIRC_MDS_8_COL,
-            round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(),
-        };
-        let (full, partial) = raw.to_optimized();
-        Poseidon1GoldilocksFused::new(&full, &partial)
-    }
-
-    /// Build a width-12 fused permutation from the fixed round constants.
-    fn make_fused_w12() -> Poseidon1GoldilocksFused<12> {
-        let raw = Poseidon1Constants {
-            rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-            rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12,
-            mds_circ_col: MATRIX_CIRC_MDS_12_COL,
-            round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(),
-        };
-        let (full, partial) = raw.to_optimized();
-        Poseidon1GoldilocksFused::new(&full, &partial)
-    }
-
-    /// Verify that the fused width-8 implementation matches the generic one
-    /// on both zero and random inputs.
-    #[test]
-    fn test_fused_matches_generic_w8() {
-        let generic = default_goldilocks_poseidon1_8();
-        let fused = make_fused_w8();
-        let mut rng = SmallRng::seed_from_u64(42);
-
-        // Zero input.
-        let mut g_state = [F::ZERO; 8];
-        let mut f_state = [F::ZERO; 8];
-        generic.permute_mut(&mut g_state);
-        fused.permute_mut(&mut f_state);
-        for i in 0..8 {
-            assert_eq!(
-                f_state[i].as_canonical_u64(),
-                g_state[i].as_canonical_u64(),
-                "Fused vs generic mismatch at index {i} (zero input, w8)"
-            );
-        }
-
-        // Random input.
-        let mut g_state: [F; 8] = rng.random();
-        let mut f_state = g_state;
-        generic.permute_mut(&mut g_state);
-        fused.permute_mut(&mut f_state);
-        for i in 0..8 {
-            assert_eq!(
-                f_state[i].as_canonical_u64(),
-                g_state[i].as_canonical_u64(),
-                "Fused vs generic mismatch at index {i} (random input, w8)"
-            );
-        }
-    }
-
-    /// Same fused-vs-generic verification for width 12.
-    #[test]
-    fn test_fused_matches_generic_w12() {
-        let generic = default_goldilocks_poseidon1_12();
-        let fused = make_fused_w12();
-        let mut rng = SmallRng::seed_from_u64(42);
-
-        let mut g_state = [F::ZERO; 12];
-        let mut f_state = [F::ZERO; 12];
-        generic.permute_mut(&mut g_state);
-        fused.permute_mut(&mut f_state);
-        for i in 0..12 {
-            assert_eq!(
-                f_state[i].as_canonical_u64(),
-                g_state[i].as_canonical_u64(),
-                "Fused vs generic mismatch at index {i} (zero input, w12)"
-            );
-        }
-
-        let mut g_state: [F; 12] = rng.random();
-        let mut f_state = g_state;
-        generic.permute_mut(&mut g_state);
-        fused.permute_mut(&mut f_state);
-        for i in 0..12 {
-            assert_eq!(
-                f_state[i].as_canonical_u64(),
-                g_state[i].as_canonical_u64(),
-                "Fused vs generic mismatch at index {i} (random input, w12)"
-            );
-        }
-    }
-
-    /// Verify that the packed (dual-lane) width-8 path matches running
-    /// two independent scalar permutations.
-    #[test]
-    fn test_packed_matches_scalar_w8() {
-        let fused = make_fused_w8();
-        let mut rng = SmallRng::seed_from_u64(123);
-
-        // Two independent random scalar inputs.
-        let scalar_a: [F; 8] = rng.random();
-        let scalar_b: [F; 8] = rng.random();
-
-        // Pack them into a single packed state and permute.
-        let mut packed: [PackedGoldilocksNeon; 8] =
-            core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]]));
-        fused.permute_mut(&mut packed);
-
-        // Compute the expected result by running scalar on each independently.
-        let mut expected_a = scalar_a;
-        let mut expected_b = scalar_b;
-        fused.permute_mut(&mut expected_a);
-        fused.permute_mut(&mut expected_b);
-
-        // Lane 0 must match the first scalar, lane 1 must match the second.
-        for i in 0..8 {
-            assert_eq!(
-                packed[i].0[0].as_canonical_u64(),
-                expected_a[i].as_canonical_u64(),
-                "Packed lane0 mismatch at index {i} (w8)"
-            );
-            assert_eq!(
-                packed[i].0[1].as_canonical_u64(),
-                expected_b[i].as_canonical_u64(),
-                "Packed lane1 mismatch at index {i} (w8)"
-            );
-        }
-    }
-
-    /// Same packed-vs-scalar verification for width 12.
-    #[test]
-    fn test_packed_matches_scalar_w12() {
-        let fused = make_fused_w12();
-        let mut rng = SmallRng::seed_from_u64(123);
-
-        let scalar_a: [F; 12] = rng.random();
-        let scalar_b: [F; 12] = rng.random();
-
-        let mut packed: [PackedGoldilocksNeon; 12] =
-            core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]]));
-        fused.permute_mut(&mut packed);
-
-        let mut expected_a = scalar_a;
-        let mut expected_b = scalar_b;
-        fused.permute_mut(&mut expected_a);
-        fused.permute_mut(&mut expected_b);
-
-        for i in 0..12 {
-            assert_eq!(
-                packed[i].0[0].as_canonical_u64(),
-                expected_a[i].as_canonical_u64(),
-                "Packed lane0 mismatch at index {i} (w12)"
-            );
-            assert_eq!(
-                packed[i].0[1].as_canonical_u64(),
-                expected_b[i].as_canonical_u64(),
-                "Packed lane1 mismatch at index {i} (w12)"
-            );
-        }
-    }
-
-    /// Known-answer test for width 8 (sequential 0..7 input).
-    #[test]
-    fn test_fused_kat_w8() {
-        let fused = make_fused_w8();
-        let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
-        fused.permute_mut(&mut input);
-
-        let expected: [F; 8] = F::new_array([
-            2431226948502761687,
-            9427563026145807618,
-            6827549936272051660,
-            16907684411084503785,
-            10131745626715172913,
-            17448305483431576765,
-            9066501914269485014,
-            12095238468458521303,
-        ]);
-        assert_eq!(input, expected);
-    }
-
-    /// Known-answer test for width 12 (sequential 0..11 input).
-    #[test]
-    fn test_fused_kat_w12() {
-        let fused = make_fused_w12();
-        let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
-        fused.permute_mut(&mut input);
-
-        let expected: [F; 12] = F::new_array([
-            15595088881848875364,
-            9564850329150784619,
-            13607005230761744521,
-            12117102595842533385,
-            2814257411756993122,
-            11640647689983397089,
-            14363867760831937423,
-            13323891071259596526,
-            11219803511311150468,
-            9221595262780869902,
-            5898229059046891887,
-            18181291031484020550,
-        ]);
-        assert_eq!(input, expected);
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs
deleted file mode 100644
index 3ca1382a9..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs
+++ /dev/null
@@ -1,843 +0,0 @@
-//! ARM assembly primitives for the Poseidon1 permutation over Goldilocks.
-
-use super::utils::{add_asm, mul_add_asm, mul_asm};
-
-// ---------------------------------------------------------------------------
-// S-box: x -> x^7 (applied to the first element only)
-// ---------------------------------------------------------------------------
-
-/// Apply the degree-7 S-box to the first element of the state.
-///
-/// Computes `x^7` using four multiplications via the addition chain:
-///
-/// ```text
-///     x -> x^2 -> x^3 (= x^2 * x)
-///                 x^4 (= x^2 * x^2)
-///                 x^7 (= x^3 * x^4)
-/// ```
-///
-/// Only the first element is modified. All other elements are unchanged.
-/// This corresponds to the non-linear step of a **partial round**.
-#[inline(always)]
-pub unsafe fn sbox_s0_asm(state: &mut [u64]) {
-    unsafe {
-        // Load the first element.
-        let s0 = state[0];
-
-        // Square: x^2.
-        let s0_2 = mul_asm(s0, s0);
-
-        // Cube: x^3 = x^2 * x.
-        let s0_3 = mul_asm(s0_2, s0);
-
-        // Fourth power: x^4 = x^2 * x^2.
-        let s0_4 = mul_asm(s0_2, s0_2);
-
-        // Seventh power: x^7 = x^3 * x^4.
-        state[0] = mul_asm(s0_3, s0_4);
-    }
-}
-
-/// Dual-lane S-box on the first element of two independent states.
-///
-/// Applies the same degree-7 S-box to both first elements. Interleaving
-/// the two chains hides the multiplication latency: while one multiply
-/// retires, the other is already in flight.
-#[inline(always)]
-pub unsafe fn sbox_s0_dual_asm(state0: &mut [u64], state1: &mut [u64]) {
-    unsafe {
-        // Load both first elements.
-        let a = state0[0];
-        let b = state1[0];
-
-        // Square both.
-        let a2 = mul_asm(a, a);
-        let b2 = mul_asm(b, b);
-
-        // Cube both: x^3 = x^2 * x.
-        let a3 = mul_asm(a2, a);
-        let b3 = mul_asm(b2, b);
-
-        // Fourth power both: x^4 = x^2 * x^2.
-        let a4 = mul_asm(a2, a2);
-        let b4 = mul_asm(b2, b2);
-
-        // Seventh power both: x^7 = x^3 * x^4.
-        state0[0] = mul_asm(a3, a4);
-        state1[0] = mul_asm(b3, b4);
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Sparse matrix-vector multiply (partial-round linear layer)
-// ---------------------------------------------------------------------------
-
-/// Sparse matrix-vector multiply for a width-8 state.
-///
-/// Implements the partial-round linear layer. The sparse matrix is
-/// encoded as its first row and a sub-diagonal vector:
-///
-/// ```text
-///     new[0]  = dot(first_row, state)            (dot product)
-///     new[i]  = state[i] + state[0] * v[i-1]   (for i >= 1)
-/// ```
-///
-/// The original first element is captured before the dot product
-/// overwrites it. The unrolled form avoids loop overhead and gives
-/// the scheduler maximum freedom to reorder independent multiply-adds.
-#[inline(always)]
-pub unsafe fn cheap_matmul_asm_w8(state: &mut [u64; 8], first_row: &[u64; 8], v: &[u64; 8]) {
-    unsafe {
-        // Capture the original first element before it gets overwritten.
-        let old_s0 = state[0];
-
-        // Dot product: accumulate dot(first_row, state).
-        let mut acc = mul_asm(state[0], first_row[0]);
-        acc = mul_add_asm(state[1], first_row[1], acc);
-        acc = mul_add_asm(state[2], first_row[2], acc);
-        acc = mul_add_asm(state[3], first_row[3], acc);
-        acc = mul_add_asm(state[4], first_row[4], acc);
-        acc = mul_add_asm(state[5], first_row[5], acc);
-        acc = mul_add_asm(state[6], first_row[6], acc);
-        acc = mul_add_asm(state[7], first_row[7], acc);
-
-        // Tail update: each remaining element gets old_first * v[i-1] added.
-        state[1] = mul_add_asm(old_s0, v[0], state[1]);
-        state[2] = mul_add_asm(old_s0, v[1], state[2]);
-        state[3] = mul_add_asm(old_s0, v[2], state[3]);
-        state[4] = mul_add_asm(old_s0, v[3], state[4]);
-        state[5] = mul_add_asm(old_s0, v[4], state[5]);
-        state[6] = mul_add_asm(old_s0, v[5], state[6]);
-        state[7] = mul_add_asm(old_s0, v[6], state[7]);
-
-        // Write the dot-product result into the first slot.
-        state[0] = acc;
-    }
-}
-
-/// Sparse matrix-vector multiply for a width-12 state.
-///
-/// Same decomposition as the width-8 variant:
-/// - Dot product for the new first element.
-/// - Scalar multiply-add for every other element.
-#[inline(always)]
-pub unsafe fn cheap_matmul_asm_w12(state: &mut [u64; 12], first_row: &[u64; 12], v: &[u64; 12]) {
-    unsafe {
-        // Capture the original first element before it gets overwritten.
-        let old_s0 = state[0];
-
-        // Dot product: accumulate dot(first_row, state).
-        let mut acc = mul_asm(state[0], first_row[0]);
-        acc = mul_add_asm(state[1], first_row[1], acc);
-        acc = mul_add_asm(state[2], first_row[2], acc);
-        acc = mul_add_asm(state[3], first_row[3], acc);
-        acc = mul_add_asm(state[4], first_row[4], acc);
-        acc = mul_add_asm(state[5], first_row[5], acc);
-        acc = mul_add_asm(state[6], first_row[6], acc);
-        acc = mul_add_asm(state[7], first_row[7], acc);
-        acc = mul_add_asm(state[8], first_row[8], acc);
-        acc = mul_add_asm(state[9], first_row[9], acc);
-        acc = mul_add_asm(state[10], first_row[10], acc);
-        acc = mul_add_asm(state[11], first_row[11], acc);
-
-        // Tail update: each remaining element gets old_first * v[i-1] added.
-        state[1] = mul_add_asm(old_s0, v[0], state[1]);
-        state[2] = mul_add_asm(old_s0, v[1], state[2]);
-        state[3] = mul_add_asm(old_s0, v[2], state[3]);
-        state[4] = mul_add_asm(old_s0, v[3], state[4]);
-        state[5] = mul_add_asm(old_s0, v[4], state[5]);
-        state[6] = mul_add_asm(old_s0, v[5], state[6]);
-        state[7] = mul_add_asm(old_s0, v[6], state[7]);
-        state[8] = mul_add_asm(old_s0, v[7], state[8]);
-        state[9] = mul_add_asm(old_s0, v[8], state[9]);
-        state[10] = mul_add_asm(old_s0, v[9], state[10]);
-        state[11] = mul_add_asm(old_s0, v[10], state[11]);
-
-        // Write the dot-product result into the first slot.
-        state[0] = acc;
-    }
-}
-
-/// Dual-lane sparse matrix-vector multiply for a width-8 state.
-///
-/// Processes two independent states through the same sparse matrix
-/// simultaneously. Both lanes share the same first-row and sub-diagonal
-/// vectors, since the matrix is fixed for a given partial round.
-///
-/// Interleaving multiply-adds from both lanes keeps the pipeline full.
-#[inline(always)]
-pub unsafe fn cheap_matmul_dual_asm_w8(
-    s0: &mut [u64; 8],
-    s1: &mut [u64; 8],
-    first_row: &[u64; 8],
-    v: &[u64; 8],
-) {
-    unsafe {
-        // Capture the original first elements from both lanes.
-        let old_a = s0[0];
-        let old_b = s1[0];
-
-        // Dot products: one per lane, interleaved.
-        let mut acc_a = mul_asm(s0[0], first_row[0]);
-        let mut acc_b = mul_asm(s1[0], first_row[0]);
-        acc_a = mul_add_asm(s0[1], first_row[1], acc_a);
-        acc_b = mul_add_asm(s1[1], first_row[1], acc_b);
-        acc_a = mul_add_asm(s0[2], first_row[2], acc_a);
-        acc_b = mul_add_asm(s1[2], first_row[2], acc_b);
-        acc_a = mul_add_asm(s0[3], first_row[3], acc_a);
-        acc_b = mul_add_asm(s1[3], first_row[3], acc_b);
-        acc_a = mul_add_asm(s0[4], first_row[4], acc_a);
-        acc_b = mul_add_asm(s1[4], first_row[4], acc_b);
-        acc_a = mul_add_asm(s0[5], first_row[5], acc_a);
-        acc_b = mul_add_asm(s1[5], first_row[5], acc_b);
-        acc_a = mul_add_asm(s0[6], first_row[6], acc_a);
-        acc_b = mul_add_asm(s1[6], first_row[6], acc_b);
-        acc_a = mul_add_asm(s0[7], first_row[7], acc_a);
-        acc_b = mul_add_asm(s1[7], first_row[7], acc_b);
-
-        // Tail updates: both lanes, interleaved.
-        s0[1] = mul_add_asm(old_a, v[0], s0[1]);
-        s1[1] = mul_add_asm(old_b, v[0], s1[1]);
-        s0[2] = mul_add_asm(old_a, v[1], s0[2]);
-        s1[2] = mul_add_asm(old_b, v[1], s1[2]);
-        s0[3] = mul_add_asm(old_a, v[2], s0[3]);
-        s1[3] = mul_add_asm(old_b, v[2], s1[3]);
-        s0[4] = mul_add_asm(old_a, v[3], s0[4]);
-        s1[4] = mul_add_asm(old_b, v[3], s1[4]);
-        s0[5] = mul_add_asm(old_a, v[4], s0[5]);
-        s1[5] = mul_add_asm(old_b, v[4], s1[5]);
-        s0[6] = mul_add_asm(old_a, v[5], s0[6]);
-        s1[6] = mul_add_asm(old_b, v[5], s1[6]);
-        s0[7] = mul_add_asm(old_a, v[6], s0[7]);
-        s1[7] = mul_add_asm(old_b, v[6], s1[7]);
-
-        // Write the dot-product results into the first slots.
-        s0[0] = acc_a;
-        s1[0] = acc_b;
-    }
-}
-
-/// Dual-lane sparse matrix-vector multiply for a width-12 state.
-///
-/// Same as the width-8 dual variant but with 12-element states.
-/// Uses loops instead of full unrolling since width 12 is large
-/// enough that code size matters more than marginal scheduling gains.
-#[inline(always)]
-pub unsafe fn cheap_matmul_dual_asm_w12(
-    s0: &mut [u64; 12],
-    s1: &mut [u64; 12],
-    first_row: &[u64; 12],
-    v: &[u64; 12],
-) {
-    unsafe {
-        // Capture the original first elements from both lanes.
-        let old_a = s0[0];
-        let old_b = s1[0];
-
-        // Dot products: one per lane, interleaved.
-        let mut acc_a = mul_asm(s0[0], first_row[0]);
-        let mut acc_b = mul_asm(s1[0], first_row[0]);
-        for i in 1..12 {
-            acc_a = mul_add_asm(s0[i], first_row[i], acc_a);
-            acc_b = mul_add_asm(s1[i], first_row[i], acc_b);
-        }
-
-        // Tail updates: both lanes.
-        for i in 1..12 {
-            s0[i] = mul_add_asm(old_a, v[i - 1], s0[i]);
-            s1[i] = mul_add_asm(old_b, v[i - 1], s1[i]);
-        }
-
-        // Write the dot-product results into the first slots.
-        s0[0] = acc_a;
-        s1[0] = acc_b;
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Dense matrix-vector multiply (full-round linear layer)
-// ---------------------------------------------------------------------------
-
-/// Dense matrix-vector multiply for a width-8 state.
-///
-/// Computes `state = M * state` where M is a full 8x8 MDS matrix
-/// stored in row-major order. Used in the **full rounds** of the
-/// permutation where every element is mixed with every other.
-///
-/// Each output element is the dot product of one matrix row with the
-/// input vector. The input is snapshotted before any writes occur.
-pub fn dense_matmul_asm_w8(state: &mut [u64; 8], m: &[[u64; 8]; 8]) {
-    unsafe {
-        // Snapshot the current state so reads are not clobbered by writes.
-        let input = *state;
-
-        // Compute each output element as a dot product of one matrix
-        // row with the snapshotted input.
-        for i in 0..8 {
-            let mut acc = mul_asm(input[0], m[i][0]);
-            for j in 1..8 {
-                acc = mul_add_asm(input[j], m[i][j], acc);
-            }
-            state[i] = acc;
-        }
-    }
-}
-
-/// Dense matrix-vector multiply for a width-12 state.
-///
-/// Same as the width-8 variant but with a 12×12 MDS matrix.
-pub fn dense_matmul_asm_w12(state: &mut [u64; 12], m: &[[u64; 12]; 12]) {
-    unsafe {
-        // Snapshot the current state.
-        let input = *state;
-
-        // One dot product per output element.
-        for i in 0..12 {
-            let mut acc = mul_asm(input[0], m[i][0]);
-            for j in 1..12 {
-                acc = mul_add_asm(input[j], m[i][j], acc);
-            }
-            state[i] = acc;
-        }
-    }
-}
-
-/// Dual-lane dense matrix-vector multiply for a width-8 state.
-///
-/// Multiplies two independent state vectors by the same 8×8 matrix.
-/// Both lanes share the matrix but have their own input and output.
-///
-/// Interleaving the two dot-product chains per row hides latency.
-pub fn dense_matmul_dual_asm_w8(s0: &mut [u64; 8], s1: &mut [u64; 8], m: &[[u64; 8]; 8]) {
-    unsafe {
-        // Snapshot both input vectors.
-        let in0 = *s0;
-        let in1 = *s1;
-
-        // For each row, compute both dot products in lockstep.
-        for i in 0..8 {
-            let mut a = mul_asm(in0[0], m[i][0]);
-            let mut b = mul_asm(in1[0], m[i][0]);
-            for j in 1..8 {
-                a = mul_add_asm(in0[j], m[i][j], a);
-                b = mul_add_asm(in1[j], m[i][j], b);
-            }
-            s0[i] = a;
-            s1[i] = b;
-        }
-    }
-}
-
-/// Dual-lane dense matrix-vector multiply for a width-12 state.
-///
-/// Same as the width-8 dual variant but with a 12×12 matrix.
-pub fn dense_matmul_dual_asm_w12(s0: &mut [u64; 12], s1: &mut [u64; 12], m: &[[u64; 12]; 12]) {
-    unsafe {
-        // Snapshot both input vectors.
-        let in0 = *s0;
-        let in1 = *s1;
-
-        // For each row, compute both dot products in lockstep.
-        for i in 0..12 {
-            let mut a = mul_asm(in0[0], m[i][0]);
-            let mut b = mul_asm(in1[0], m[i][0]);
-            for j in 1..12 {
-                a = mul_add_asm(in0[j], m[i][j], a);
-                b = mul_add_asm(in1[j], m[i][j], b);
-            }
-            s0[i] = a;
-            s1[i] = b;
-        }
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Round-constant addition
-// ---------------------------------------------------------------------------
-
-/// Add round constants to every element of the state.
-///
-/// This is the first step of every Poseidon1 round. Each element
-/// receives its own constant, added in the Goldilocks field.
-///
-/// Generic over the state width to work with both width-8 and width-12.
-#[inline(always)]
-pub unsafe fn add_rc_asm<const WIDTH: usize>(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) {
-    unsafe {
-        // Element-wise modular addition.
-        for i in 0..WIDTH {
-            state[i] = add_asm(state[i], rc[i]);
-        }
-    }
-}
-
-/// Dual-lane round-constant addition.
-///
-/// Adds the same constants to two independent states. Both lanes
-/// share the constants because they are at the same round position.
-#[inline(always)]
-pub unsafe fn add_rc_dual_asm<const WIDTH: usize>(
-    s0: &mut [u64; WIDTH],
-    s1: &mut [u64; WIDTH],
-    rc: &[u64; WIDTH],
-) {
-    unsafe {
-        // Both lanes receive the same constant at each position.
-        for i in 0..WIDTH {
-            s0[i] = add_asm(s0[i], rc[i]);
-            s1[i] = add_asm(s1[i], rc[i]);
-        }
-    }
-}
-
-/// Add a single round constant to the first element only.
-///
-/// Used in partial rounds where only the first element enters the
-/// S-box and thus only needs its own constant added.
-#[inline(always)]
-pub unsafe fn add_scalar_s0_asm(state: &mut [u64], rc: u64) {
-    unsafe {
-        // Only the first element is modified.
-        state[0] = add_asm(state[0], rc);
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::PrimeField64;
-    use proptest::prelude::*;
-    use rand::SeedableRng;
-    use rand::rngs::SmallRng;
-
-    use super::*;
-    use crate::Goldilocks;
-
-    type F = Goldilocks;
-
-    /// Reduce a raw `u64` to its canonical Goldilocks representative.
-    ///
-    /// Wraps the value into a field element and extracts the unique
-    /// representative in `[0, P)`. This is the single source of truth
-    /// for comparing ASM outputs (which may carry unreduced values)
-    /// against field-level references.
-    fn canon(x: u64) -> u64 {
-        F::new(x).as_canonical_u64()
-    }
-
-    proptest! {
-        // ================================================================
-        // S-box: first element raised to the 7th power
-        // ================================================================
-
-        /// Verify the single-lane S-box against a field-level reference.
-        ///
-        /// The reference computes x^7 step by step using field multiplication.
-        /// Only the first element should change; the rest must be untouched.
-        #[test]
-        fn test_sbox_s0_asm(vals in prop::array::uniform8(any::<u64>())) {
-            // Build the expected x^7 using the field multiplication chain.
-            let x = F::new(vals[0]);
-            let x2 = x * x;
-            let x3 = x2 * x;
-            let x4 = x2 * x2;
-            let expected_s0 = (x3 * x4).as_canonical_u64();
-
-            // Run the ASM version on a copy.
-            let mut state = vals;
-            unsafe { sbox_s0_asm(&mut state); }
-
-            // The first element must match x^7.
-            prop_assert_eq!(canon(state[0]), expected_s0);
-
-            // Every other element must be unchanged.
-            for i in 1..8 {
-                prop_assert_eq!(state[i], vals[i]);
-            }
-        }
-
-        /// Verify the dual-lane S-box matches two independent single-lane calls.
-        ///
-        /// Runs the single-lane version on each lane separately as the
-        /// reference, then checks the dual-lane version produces the same.
-        #[test]
-        fn test_sbox_s0_dual_asm(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Build the reference by running single-lane on each lane.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                sbox_s0_asm(&mut ref0);
-                sbox_s0_asm(&mut ref1);
-            }
-
-            // Run the dual-lane version.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { sbox_s0_dual_asm(&mut s0, &mut s1); }
-
-            // Both first elements must match their reference.
-            prop_assert_eq!(canon(s0[0]), canon(ref0[0]));
-            prop_assert_eq!(canon(s1[0]), canon(ref1[0]));
-
-            // All other elements must be unchanged.
-            for i in 1..8 {
-                prop_assert_eq!(s0[i], vals0[i]);
-                prop_assert_eq!(s1[i], vals1[i]);
-            }
-        }
-
-        // ================================================================
-        // Round-constant addition: element-wise field addition
-        // ================================================================
-
-        /// Verify round-constant addition (width 8) against field addition.
-        ///
-        /// Each element should equal the field sum of the original value
-        /// and its corresponding round constant.
-        #[test]
-        fn test_add_rc_asm_w8(
-            vals in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Build the expected result using field addition.
-            let expected: [u64; 8] = core::array::from_fn(|i| {
-                (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64()
-            });
-
-            // Run the ASM version.
-            let mut state = vals;
-            unsafe { add_rc_asm(&mut state, &rc); }
-
-            // Every element must match.
-            for i in 0..8 {
-                prop_assert_eq!(canon(state[i]), expected[i]);
-            }
-        }
-
-        /// Same verification for width 12.
-        #[test]
-        fn test_add_rc_asm_w12(
-            vals in prop::array::uniform12(any::<u64>()),
-            rc in prop::array::uniform12(any::<u64>()),
-        ) {
-            let expected: [u64; 12] = core::array::from_fn(|i| {
-                (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64()
-            });
-
-            let mut state = vals;
-            unsafe { add_rc_asm(&mut state, &rc); }
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(state[i]), expected[i]);
-            }
-        }
-
-        /// Verify dual-lane round-constant addition (width 8) matches
-        /// two independent single-lane calls.
-        #[test]
-        fn test_add_rc_dual_asm_w8(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Reference: single-lane on each independently.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                add_rc_asm(&mut ref0, &rc);
-                add_rc_asm(&mut ref1, &rc);
-            }
-
-            // Run the dual-lane version.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); }
-
-            // Both lanes must match their references.
-            for i in 0..8 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        /// Same dual-lane verification for width 12.
-        #[test]
-        fn test_add_rc_dual_asm_w12(
-            vals0 in prop::array::uniform12(any::<u64>()),
-            vals1 in prop::array::uniform12(any::<u64>()),
-            rc in prop::array::uniform12(any::<u64>()),
-        ) {
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                add_rc_asm(&mut ref0, &rc);
-                add_rc_asm(&mut ref1, &rc);
-            }
-
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); }
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        // ================================================================
-        // Scalar addition: first element only
-        // ================================================================
-
-        /// Verify that adding a scalar to the first element matches
-        /// field addition, and that all other elements are untouched.
-        #[test]
-        fn test_add_scalar_s0_asm(vals in prop::array::uniform8(any::<u64>()), rc: u64) {
-            // Expected: field sum of the first element and the constant.
-            let expected_s0 = (F::new(vals[0]) + F::new(rc)).as_canonical_u64();
-
-            // Run the ASM version.
-            let mut state = vals;
-            unsafe { add_scalar_s0_asm(&mut state, rc); }
-
-            // The first element must match.
-            prop_assert_eq!(canon(state[0]), expected_s0);
-
-            // Every other element must be unchanged.
-            for i in 1..8 {
-                prop_assert_eq!(state[i], vals[i]);
-            }
-        }
-
-        // ================================================================
-        // Sparse matrix-vector multiply (partial-round linear layer)
-        //
-        // The sparse matrix decomposes into:
-        //   new[0] = dot(first_row, state)
-        //   new[i] = state[i] + state[0] * v[i-1]   for i >= 1
-        // ================================================================
-
-        /// Verify the width-8 sparse matmul against a field-level reference.
-        ///
-        /// Builds the expected result by computing the dot product and
-        /// the per-element multiply-add using Goldilocks field operations.
-        #[test]
-        fn test_cheap_matmul_asm_w8(
-            vals in prop::array::uniform8(any::<u64>()),
-            first_row in prop::array::uniform8(any::<u64>()),
-            v in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Lift raw values into field elements.
-            let f: [F; 8] = vals.map(F::new);
-            let fr: [F; 8] = first_row.map(F::new);
-            let fv: [F; 8] = v.map(F::new);
-
-            // Capture the original first element.
-            let old_s0 = f[0];
-
-            // Dot product for the new first element.
-            let new_s0: F = (0..8).map(|i| f[i] * fr[i]).sum();
-
-            // Tail update for elements 1..8.
-            let mut expected = f;
-            for i in 1..8 {
-                expected[i] = f[i] + old_s0 * fv[i - 1];
-            }
-            expected[0] = new_s0;
-
-            // Run the ASM version.
-            let mut state = vals;
-            unsafe { cheap_matmul_asm_w8(&mut state, &first_row, &v); }
-
-            // Every element must match.
-            for i in 0..8 {
-                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
-            }
-        }
-
-        /// Same verification for width 12.
-        #[test]
-        fn test_cheap_matmul_asm_w12(
-            vals in prop::array::uniform12(any::<u64>()),
-            first_row in prop::array::uniform12(any::<u64>()),
-            v in prop::array::uniform12(any::<u64>()),
-        ) {
-            let f: [F; 12] = vals.map(F::new);
-            let fr: [F; 12] = first_row.map(F::new);
-            let fv: [F; 12] = v.map(F::new);
-
-            let old_s0 = f[0];
-            let new_s0: F = (0..12).map(|i| f[i] * fr[i]).sum();
-
-            let mut expected = f;
-            for i in 1..12 {
-                expected[i] = f[i] + old_s0 * fv[i - 1];
-            }
-            expected[0] = new_s0;
-
-            let mut state = vals;
-            unsafe { cheap_matmul_asm_w12(&mut state, &first_row, &v); }
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
-            }
-        }
-
-        /// Verify the width-8 dual-lane sparse matmul matches two
-        /// independent single-lane calls.
-        #[test]
-        fn test_cheap_matmul_dual_asm_w8(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-            first_row in prop::array::uniform8(any::<u64>()),
-            v in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Reference: single-lane on each independently.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                cheap_matmul_asm_w8(&mut ref0, &first_row, &v);
-                cheap_matmul_asm_w8(&mut ref1, &first_row, &v);
-            }
-
-            // Run the dual-lane version.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { cheap_matmul_dual_asm_w8(&mut s0, &mut s1, &first_row, &v); }
-
-            // Both lanes must match their references.
-            for i in 0..8 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        /// Same dual-lane verification for width 12.
-        #[test]
-        fn test_cheap_matmul_dual_asm_w12(
-            vals0 in prop::array::uniform12(any::<u64>()),
-            vals1 in prop::array::uniform12(any::<u64>()),
-            first_row in prop::array::uniform12(any::<u64>()),
-            v in prop::array::uniform12(any::<u64>()),
-        ) {
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                cheap_matmul_asm_w12(&mut ref0, &first_row, &v);
-                cheap_matmul_asm_w12(&mut ref1, &first_row, &v);
-            }
-
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { cheap_matmul_dual_asm_w12(&mut s0, &mut s1, &first_row, &v); }
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        // ================================================================
-        // Dense matrix-vector multiply (full-round linear layer)
-        // ================================================================
-
-        /// Verify the width-8 dense matmul against a field-level reference.
-        ///
-        /// Each output element is the dot product of one matrix row with
-        /// the input vector. The matrix is fixed from a deterministic seed.
-        #[test]
-        fn test_dense_matmul_asm_w8(vals in prop::array::uniform8(any::<u64>())) {
-            // Fixed matrix from a deterministic seed.
-            let mut rng = SmallRng::seed_from_u64(42);
-            let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng);
-
-            // Reference: standard matrix-vector product using field ops.
-            let f: [F; 8] = vals.map(F::new);
-            let expected: [F; 8] = core::array::from_fn(|i| {
-                (0..8).map(|j| f[j] * F::new(m[i][j])).sum()
-            });
-
-            // Run the ASM version.
-            let mut state = vals;
-            dense_matmul_asm_w8(&mut state, &m);
-
-            // Every element must match.
-            for i in 0..8 {
-                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
-            }
-        }
-
-        /// Same verification for width 12.
-        #[test]
-        fn test_dense_matmul_asm_w12(vals in prop::array::uniform12(any::<u64>())) {
-            let mut rng = SmallRng::seed_from_u64(43);
-            let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng);
-
-            let f: [F; 12] = vals.map(F::new);
-            let expected: [F; 12] = core::array::from_fn(|i| {
-                (0..12).map(|j| f[j] * F::new(m[i][j])).sum()
-            });
-
-            let mut state = vals;
-            dense_matmul_asm_w12(&mut state, &m);
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
-            }
-        }
-
-        /// Verify the width-8 dual-lane dense matmul matches two
-        /// independent single-lane calls.
-        #[test]
-        fn test_dense_matmul_dual_asm_w8(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Fixed matrix from a deterministic seed.
-            let mut rng = SmallRng::seed_from_u64(44);
-            let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng);
-
-            // Reference: single-lane on each independently.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            dense_matmul_asm_w8(&mut ref0, &m);
-            dense_matmul_asm_w8(&mut ref1, &m);
-
-            // Run the dual-lane version.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            dense_matmul_dual_asm_w8(&mut s0, &mut s1, &m);
-
-            // Both lanes must match their references.
-            for i in 0..8 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        /// Same dual-lane verification for width 12.
-        #[test]
-        fn test_dense_matmul_dual_asm_w12(
-            vals0 in prop::array::uniform12(any::<u64>()),
-            vals1 in prop::array::uniform12(any::<u64>()),
-        ) {
-            let mut rng = SmallRng::seed_from_u64(45);
-            let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng);
-
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            dense_matmul_asm_w12(&mut ref0, &m);
-            dense_matmul_asm_w12(&mut ref1, &m);
-
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            dense_matmul_dual_asm_w12(&mut s0, &mut s1, &m);
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs
deleted file mode 100644
index cf74b4df8..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs
+++ /dev/null
@@ -1,652 +0,0 @@
-//! Optimized Poseidon2 for Goldilocks on aarch64.
-//!
-//! Uses ARM inline assembly with latency hiding via interleaved S-box/MDS computation.
-//! Fully unrolled internal rounds for W8, W12, W16.
-//!
-//! For packed operations, lanes are extracted to scalar, processed with interleaved
-//! dual-lane ASM, then repacked. This is faster than using PackedGoldilocksNeon
-//! arithmetic directly because the scalar `add_asm` avoids the modular reduction
-//! overhead present in NEON addition.
-
-use alloc::vec::Vec;
-
-use p3_poseidon2::{
-    ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, InternalLayer,
-    InternalLayerConstructor, poseidon2_round_numbers_128,
-};
-use p3_symmetric::{CryptographicPermutation, Permutation};
-use rand::distr::{Distribution, StandardUniform};
-use rand::{Rng, RngExt};
-
-use super::packing::PackedGoldilocksNeon;
-use super::poseidon2_asm::*;
-use super::utils::{pack_lanes, unpack_lanes};
-use crate::{Goldilocks, MATRIX_DIAG_20_GOLDILOCKS};
-
-/// Degree of the chosen permutation polynomial for Goldilocks.
-const GOLDILOCKS_S_BOX_DEGREE: u64 = 7;
-
-/// ASM-optimized internal layer with split-state s0-in-register, pre-converted constants.
-#[derive(Debug, Default, Clone)]
-pub struct Poseidon2InternalLayerGoldilocksAsm {
-    constants_raw: Vec<u64>,
-}
-
-impl InternalLayerConstructor<Goldilocks> for Poseidon2InternalLayerGoldilocksAsm {
-    fn new_from_constants(internal_constants: Vec<Goldilocks>) -> Self {
-        let constants_raw = internal_constants.iter().map(|c| c.value).collect();
-        Self { constants_raw }
-    }
-}
-
-const DIAG_RAW_20: [u64; 20] = {
-    let mut arr = [0u64; 20];
-    let mut i = 0;
-    while i < 20 {
-        arr[i] = MATRIX_DIAG_20_GOLDILOCKS[i].value;
-        i += 1;
-    }
-    arr
-};
-
-impl InternalLayer<Goldilocks, 8, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocksAsm {
-    fn permute_state(&self, state: &mut [Goldilocks; 8]) {
-        let state_raw: &mut [u64; 8] =
-            unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) };
-        internal_permute_state_asm_w8(state_raw, &self.constants_raw);
-    }
-}
-
-impl InternalLayer<Goldilocks, 12, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [Goldilocks; 12]) {
-        let state_raw: &mut [u64; 12] =
-            unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) };
-        internal_permute_state_asm_w12(state_raw, &self.constants_raw);
-    }
-}
-
-impl InternalLayer<Goldilocks, 16, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [Goldilocks; 16]) {
-        let state_raw: &mut [u64; 16] =
-            unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) };
-        internal_permute_state_asm_w16(state_raw, &self.constants_raw);
-    }
-}
-
-impl InternalLayer<Goldilocks, 20, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [Goldilocks; 20]) {
-        let state_raw: &mut [u64; 20] =
-            unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) };
-        internal_permute_state_asm(state_raw, &DIAG_RAW_20, &self.constants_raw);
-    }
-}
-
-#[derive(Clone)]
-pub struct Poseidon2ExternalLayerGoldilocksAsm<const WIDTH: usize> {
-    initial_constants_raw: Vec<[u64; WIDTH]>,
-    terminal_constants_raw: Vec<[u64; WIDTH]>,
-}
-
-impl<const WIDTH: usize> ExternalLayerConstructor<Goldilocks, WIDTH>
-    for Poseidon2ExternalLayerGoldilocksAsm<WIDTH>
-{
-    fn new_from_constants(external_constants: ExternalLayerConstants<Goldilocks, WIDTH>) -> Self {
-        let initial_constants_raw = external_constants
-            .get_initial_constants()
-            .iter()
-            .map(|rc| core::array::from_fn(|i| rc[i].value))
-            .collect();
-        let terminal_constants_raw = external_constants
-            .get_terminal_constants()
-            .iter()
-            .map(|rc| core::array::from_fn(|i| rc[i].value))
-            .collect();
-        Self {
-            initial_constants_raw,
-            terminal_constants_raw,
-        }
-    }
-}
-
-impl ExternalLayer<Goldilocks, 8, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<8>
-{
-    fn permute_state_initial(&self, state: &mut [Goldilocks; 8]) {
-        let state_raw: &mut [u64; 8] =
-            unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) };
-        external_initial_permute_w8(state_raw, &self.initial_constants_raw);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [Goldilocks; 8]) {
-        let state_raw: &mut [u64; 8] =
-            unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) };
-        external_terminal_permute_w8(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl ExternalLayer<Goldilocks, 12, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<12>
-{
-    fn permute_state_initial(&self, state: &mut [Goldilocks; 12]) {
-        let state_raw: &mut [u64; 12] =
-            unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) };
-        external_initial_permute_state_asm(state_raw, &self.initial_constants_raw);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [Goldilocks; 12]) {
-        let state_raw: &mut [u64; 12] =
-            unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) };
-        external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl ExternalLayer<Goldilocks, 16, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<16>
-{
-    fn permute_state_initial(&self, state: &mut [Goldilocks; 16]) {
-        let state_raw: &mut [u64; 16] =
-            unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) };
-        external_initial_permute_state_asm(state_raw, &self.initial_constants_raw);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [Goldilocks; 16]) {
-        let state_raw: &mut [u64; 16] =
-            unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) };
-        external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl ExternalLayer<Goldilocks, 20, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<20>
-{
-    fn permute_state_initial(&self, state: &mut [Goldilocks; 20]) {
-        let state_raw: &mut [u64; 20] =
-            unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) };
-        external_initial_permute_state_asm(state_raw, &self.initial_constants_raw);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [Goldilocks; 20]) {
-        let state_raw: &mut [u64; 20] =
-            unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) };
-        external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-/// Type alias for scalar ASM-optimized Poseidon2.
-pub type Poseidon2GoldilocksAsm<const WIDTH: usize> = p3_poseidon2::Poseidon2<
-    Goldilocks,
-    Poseidon2ExternalLayerGoldilocksAsm<WIDTH>,
-    Poseidon2InternalLayerGoldilocksAsm,
-    WIDTH,
-    GOLDILOCKS_S_BOX_DEGREE,
->;
-
-impl InternalLayer<PackedGoldilocksNeon, 8, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 8]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        internal_permute_split_dual_w8(&mut lane0, &mut lane1, &self.constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl InternalLayer<PackedGoldilocksNeon, 12, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 12]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        internal_permute_split_dual_w12(&mut lane0, &mut lane1, &self.constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl InternalLayer<PackedGoldilocksNeon, 16, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 16]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        internal_permute_split_dual_w16(&mut lane0, &mut lane1, &self.constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl InternalLayer<PackedGoldilocksNeon, 20, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2InternalLayerGoldilocksAsm
-{
-    fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 20]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        internal_permute_split_dual(&mut lane0, &mut lane1, &DIAG_RAW_20, &self.constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl ExternalLayer<PackedGoldilocksNeon, 8, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<8>
-{
-    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 8]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_initial_permute_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 8]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_terminal_permute_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl ExternalLayer<PackedGoldilocksNeon, 12, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<12>
-{
-    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 12]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 12]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl ExternalLayer<PackedGoldilocksNeon, 16, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<16>
-{
-    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 16]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 16]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl ExternalLayer<PackedGoldilocksNeon, 20, GOLDILOCKS_S_BOX_DEGREE>
-    for Poseidon2ExternalLayerGoldilocksAsm<20>
-{
-    fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 20]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-
-    fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 20]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-/// Fused Poseidon2 permutation for Goldilocks.
-///
-/// Instead of unpacking/packing between each of the 3 phases (initial external,
-/// internal, terminal external), this performs a single unpack at the start and
-/// a single pack at the end, eliminating the redundant lane conversions per
-/// packed permutation.
-#[derive(Clone, Debug)]
-pub struct Poseidon2GoldilocksFused<const WIDTH: usize> {
-    internal_constants_raw: Vec<u64>,
-    initial_constants_raw: Vec<[u64; WIDTH]>,
-    terminal_constants_raw: Vec<[u64; WIDTH]>,
-}
-
-impl<const WIDTH: usize> Poseidon2GoldilocksFused<WIDTH> {
-    pub fn new(
-        external_constants: &ExternalLayerConstants<Goldilocks, WIDTH>,
-        internal_constants: &[Goldilocks],
-    ) -> Self {
-        let internal_constants_raw = internal_constants.iter().map(|c| c.value).collect();
-        let initial_constants_raw = external_constants
-            .get_initial_constants()
-            .iter()
-            .map(|rc| core::array::from_fn(|i| rc[i].value))
-            .collect();
-        let terminal_constants_raw = external_constants
-            .get_terminal_constants()
-            .iter()
-            .map(|rc| core::array::from_fn(|i| rc[i].value))
-            .collect();
-        Self {
-            internal_constants_raw,
-            initial_constants_raw,
-            terminal_constants_raw,
-        }
-    }
-
-    pub fn new_from_rng<R: Rng>(rounds_f: usize, rounds_p: usize, rng: &mut R) -> Self
-    where
-        StandardUniform: Distribution<Goldilocks> + Distribution<[Goldilocks; WIDTH]>,
-    {
-        let external_constants = ExternalLayerConstants::new_from_rng(rounds_f, rng);
-        let internal_constants = rng
-            .sample_iter(StandardUniform)
-            .take(rounds_p)
-            .collect::<Vec<_>>();
-        Self::new(&external_constants, &internal_constants)
-    }
-
-    pub fn new_from_rng_128<R: Rng>(rng: &mut R) -> Self
-    where
-        StandardUniform: Distribution<Goldilocks> + Distribution<[Goldilocks; WIDTH]>,
-    {
-        let round_numbers =
-            poseidon2_round_numbers_128::<Goldilocks>(WIDTH, GOLDILOCKS_S_BOX_DEGREE);
-        let (rounds_f, rounds_p) = round_numbers.unwrap_or_else(|e| panic!("{e}"));
-        Self::new_from_rng(rounds_f, rounds_p, rng)
-    }
-}
-
-impl Permutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 8]) {
-        let state_raw: &mut [u64; 8] =
-            unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) };
-        external_initial_permute_w8(state_raw, &self.initial_constants_raw);
-        internal_permute_state_asm_w8(state_raw, &self.internal_constants_raw);
-        external_terminal_permute_w8(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> {}
-
-impl Permutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 12]) {
-        let state_raw: &mut [u64; 12] =
-            unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) };
-        external_initial_permute_state_asm(state_raw, &self.initial_constants_raw);
-        internal_permute_state_asm_w12(state_raw, &self.internal_constants_raw);
-        external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> {}
-
-impl Permutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 16]) {
-        let state_raw: &mut [u64; 16] =
-            unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) };
-        external_initial_permute_state_asm(state_raw, &self.initial_constants_raw);
-        internal_permute_state_asm_w16(state_raw, &self.internal_constants_raw);
-        external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> {}
-
-impl Permutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> {
-    fn permute_mut(&self, state: &mut [Goldilocks; 20]) {
-        let state_raw: &mut [u64; 20] =
-            unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) };
-        external_initial_permute_state_asm(state_raw, &self.initial_constants_raw);
-        internal_permute_state_asm(state_raw, &DIAG_RAW_20, &self.internal_constants_raw);
-        external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw);
-    }
-}
-
-impl CryptographicPermutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> {}
-
-impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        external_initial_permute_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw);
-        internal_permute_split_dual_w8(&mut lane0, &mut lane1, &self.internal_constants_raw);
-        external_terminal_permute_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> {}
-
-impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        let mut sv = lanes_to_neon(&lane0, &lane1);
-        external_initial_neon(&mut sv, &self.initial_constants_raw);
-        internal_permute_neon_w12(&mut sv, &self.internal_constants_raw);
-        external_terminal_neon(&mut sv, &self.terminal_constants_raw);
-        neon_to_lanes(&sv, &mut lane0, &mut lane1);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> {}
-
-impl Permutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 16]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        let mut sv = lanes_to_neon(&lane0, &lane1);
-        external_initial_neon(&mut sv, &self.initial_constants_raw);
-        internal_permute_neon_w16(&mut sv, &self.internal_constants_raw);
-        external_terminal_neon(&mut sv, &self.terminal_constants_raw);
-        neon_to_lanes(&sv, &mut lane0, &mut lane1);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> {}
-
-impl Permutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> {
-    fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 20]) {
-        let (mut lane0, mut lane1) = unpack_lanes(state);
-        let mut sv = lanes_to_neon(&lane0, &lane1);
-        external_initial_neon(&mut sv, &self.initial_constants_raw);
-        internal_permute_neon(&mut sv, &DIAG_RAW_20, &self.internal_constants_raw);
-        external_terminal_neon(&mut sv, &self.terminal_constants_raw);
-        neon_to_lanes(&sv, &mut lane0, &mut lane1);
-        pack_lanes(state, &lane0, &lane1);
-    }
-}
-
-impl CryptographicPermutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> {}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::{PrimeCharacteristicRing, PrimeField64};
-    use p3_poseidon2::{ExternalLayerConstants, InternalLayer, Poseidon2};
-    use p3_symmetric::Permutation;
-    use rand::rngs::SmallRng;
-    use rand::{RngExt, SeedableRng};
-
-    use super::*;
-    use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE;
-    use crate::{
-        GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8,
-        Poseidon2ExternalLayerGoldilocks, Poseidon2InternalLayerGoldilocks,
-    };
-
-    type F = Goldilocks;
-
-    // Test that fully ASM-optimized implementation matches generic scalar
-    fn test_asm_matches_generic<const WIDTH: usize>()
-    where
-        Poseidon2InternalLayerGoldilocks: InternalLayer<F, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
-        Poseidon2InternalLayerGoldilocksAsm: InternalLayer<F, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
-        Poseidon2ExternalLayerGoldilocksAsm<WIDTH>:
-            ExternalLayer<Goldilocks, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
-    {
-        let mut rng = SmallRng::seed_from_u64(42);
-
-        let external_constants = ExternalLayerConstants::<Goldilocks, WIDTH>::new_from_rng(
-            2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS,
-            &mut rng,
-        );
-        let internal_constants: Vec<Goldilocks> = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8)
-            .map(|_| F::from_u64(rng.random()))
-            .collect();
-
-        // Generic scalar implementation
-        let generic_poseidon2: Poseidon2<
-            Goldilocks,
-            Poseidon2ExternalLayerGoldilocks<WIDTH>,
-            Poseidon2InternalLayerGoldilocks,
-            WIDTH,
-            GOLDILOCKS_S_BOX_DEGREE,
-        > = Poseidon2::new(external_constants.clone(), internal_constants.clone());
-
-        // Fully ASM-optimized implementation
-        let asm_poseidon2: Poseidon2GoldilocksAsm<WIDTH> =
-            Poseidon2::new(external_constants, internal_constants);
-
-        // Test with zeros
-        let mut generic_input = [F::ZERO; WIDTH];
-        let mut asm_input = [F::ZERO; WIDTH];
-
-        generic_poseidon2.permute_mut(&mut generic_input);
-        asm_poseidon2.permute_mut(&mut asm_input);
-
-        for i in 0..WIDTH {
-            assert_eq!(
-                asm_input[i].as_canonical_u64(),
-                generic_input[i].as_canonical_u64(),
-                "ASM mismatch at index {i} for zero input"
-            );
-        }
-
-        // Test with random input
-        let mut generic_input: [F; WIDTH] = core::array::from_fn(|_| F::from_u64(rng.random()));
-        let mut asm_input = generic_input;
-
-        generic_poseidon2.permute_mut(&mut generic_input);
-        asm_poseidon2.permute_mut(&mut asm_input);
-
-        for i in 0..WIDTH {
-            assert_eq!(
-                asm_input[i].as_canonical_u64(),
-                generic_input[i].as_canonical_u64(),
-                "ASM mismatch at index {i} for random input"
-            );
-        }
-    }
-
-    fn test_fused_matches_generic<const WIDTH: usize>()
-    where
-        Poseidon2InternalLayerGoldilocks: InternalLayer<F, WIDTH, GOLDILOCKS_S_BOX_DEGREE>,
-        Poseidon2GoldilocksFused<WIDTH>:
-            Permutation<[F; WIDTH]> + Permutation<[PackedGoldilocksNeon; WIDTH]>,
-    {
-        let mut rng = SmallRng::seed_from_u64(42);
-
-        let external_constants = ExternalLayerConstants::<Goldilocks, WIDTH>::new_from_rng(
-            2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS,
-            &mut rng,
-        );
-        let internal_constants: Vec<Goldilocks> = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8)
-            .map(|_| rng.random())
-            .collect();
-
-        let generic_poseidon2: Poseidon2<
-            Goldilocks,
-            Poseidon2ExternalLayerGoldilocks<WIDTH>,
-            Poseidon2InternalLayerGoldilocks,
-            WIDTH,
-            GOLDILOCKS_S_BOX_DEGREE,
-        > = Poseidon2::new(external_constants.clone(), internal_constants.clone());
-
-        let fused =
-            Poseidon2GoldilocksFused::<WIDTH>::new(&external_constants, &internal_constants);
-
-        // Scalar: fused vs generic
-        let mut generic_input = [F::ZERO; WIDTH];
-        let mut fused_input = [F::ZERO; WIDTH];
-        generic_poseidon2.permute_mut(&mut generic_input);
-        fused.permute_mut(&mut fused_input);
-        for i in 0..WIDTH {
-            assert_eq!(
-                fused_input[i].as_canonical_u64(),
-                generic_input[i].as_canonical_u64(),
-                "Fused scalar mismatch at index {i} for zero input"
-            );
-        }
-
-        let mut generic_input: [F; WIDTH] = rng.random();
-        let mut fused_input = generic_input;
-        generic_poseidon2.permute_mut(&mut generic_input);
-        fused.permute_mut(&mut fused_input);
-        for i in 0..WIDTH {
-            assert_eq!(
-                fused_input[i].as_canonical_u64(),
-                generic_input[i].as_canonical_u64(),
-                "Fused scalar mismatch at index {i} for random input"
-            );
-        }
-
-        // Packed: fused packed vs scalar (each packed lane should match scalar)
-        let scalar_a: [F; WIDTH] = rng.random();
-        let scalar_b: [F; WIDTH] = rng.random();
-
-        let mut packed_input: [PackedGoldilocksNeon; WIDTH] =
-            core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]]));
-        fused.permute_mut(&mut packed_input);
-
-        let mut expected_a = scalar_a;
-        let mut expected_b = scalar_b;
-        fused.permute_mut(&mut expected_a);
-        fused.permute_mut(&mut expected_b);
-
-        for i in 0..WIDTH {
-            assert_eq!(
-                packed_input[i].0[0].as_canonical_u64(),
-                expected_a[i].as_canonical_u64(),
-                "Fused packed lane0 mismatch at index {i}"
-            );
-            assert_eq!(
-                packed_input[i].0[1].as_canonical_u64(),
-                expected_b[i].as_canonical_u64(),
-                "Fused packed lane1 mismatch at index {i}"
-            );
-        }
-    }
-
-    #[test]
-    fn test_asm_matches_generic_width_8() {
-        test_asm_matches_generic::<8>();
-    }
-
-    #[test]
-    fn test_asm_matches_generic_width_12() {
-        test_asm_matches_generic::<12>();
-    }
-
-    #[test]
-    fn test_asm_matches_generic_width_16() {
-        test_asm_matches_generic::<16>();
-    }
-
-    #[test]
-    fn test_asm_matches_generic_width_20() {
-        test_asm_matches_generic::<20>();
-    }
-
-    #[test]
-    fn test_fused_matches_generic_width_8() {
-        test_fused_matches_generic::<8>();
-    }
-
-    #[test]
-    fn test_fused_matches_generic_width_12() {
-        test_fused_matches_generic::<12>();
-    }
-
-    #[test]
-    fn test_fused_matches_generic_width_16() {
-        test_fused_matches_generic::<16>();
-    }
-
-    #[test]
-    fn test_fused_matches_generic_width_20() {
-        test_fused_matches_generic::<20>();
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs
deleted file mode 100644
index 00b7fdc57..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs
+++ /dev/null
@@ -1,2621 +0,0 @@
-//! ARM assembly primitives for Poseidon2 on Goldilocks.
-//!
-//! Latency hiding: ARM mul/umulh have ~4-5 cycle latency. By interleaving
-//! S-box computation with MDS operations, we hide much of this latency.
-
-use core::arch::aarch64::*;
-use core::arch::asm;
-
-use super::utils::{add_asm, mul_add_asm, mul_asm};
-use crate::P;
-
-/// Compute x / 2 in the Goldilocks field, matching `halve_u64::<P>`.
-#[inline(always)]
-unsafe fn div2_asm(x: u64) -> u64 {
-    let shift = (P + 1) >> 1;
-    let result: u64;
-    let _tmp: u64;
-
-    unsafe {
-        asm!(
-            // result = x >> 1
-            "lsr   {result}, {x}, #1",
-            // tmp = x & 1
-            "and   {tmp}, {x}, #1",
-            // if tmp != 0 (x odd), tmp := shift, else tmp := 0
-            "cmp   {tmp}, #0",
-            "csel  {tmp}, {shift}, xzr, ne",
-            // result += tmp
-            "add   {result}, {result}, {tmp}",
-            x      = in(reg) x,
-            shift  = in(reg) shift,
-            tmp    = out(reg) _tmp,
-            result = out(reg) result,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    result
-}
-
-#[inline(always)]
-unsafe fn div4_asm(x: u64) -> u64 {
-    unsafe { div2_asm(div2_asm(x)) }
-}
-
-#[inline(always)]
-unsafe fn div8_asm(x: u64) -> u64 {
-    unsafe { div2_asm(div4_asm(x)) }
-}
-
-#[inline(always)]
-unsafe fn div16_asm(x: u64) -> u64 {
-    unsafe { div2_asm(div8_asm(x)) }
-}
-
-#[inline(always)]
-unsafe fn div32_asm(x: u64) -> u64 {
-    unsafe { div4_asm(div8_asm(x)) }
-}
-
-/// Compute x * 2^{-32} mod P using the Goldilocks structure.
-///
-/// Since P = 2^64 - 2^32 + 1, we have 2^{-32} ≡ 1 - 2^{32} (mod P).
-/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P,
-/// where x_hi = x >> 32, x_lo = x & 0xFFFFFFFF.
-#[inline(always)]
-unsafe fn div_2_32_asm(x: u64) -> u64 {
-    let result: u64;
-    let _hi: u64;
-    let _lo: u64;
-    let _t: u64;
-    let _sum: u64;
-    let _adj: u64;
-
-    unsafe {
-        asm!(
-            "lsr   {hi}, {x}, #32",
-            "and   {lo}, {x}, #0xFFFFFFFF",
-            "add   {sum}, {hi}, {lo}",
-            "lsl   {t}, {lo}, #32",
-            "subs  {result}, {sum}, {t}",
-            "csetm {adj:w}, cc",
-            "sub   {result}, {result}, {adj}",
-            x      = in(reg) x,
-            hi     = out(reg) _hi,
-            lo     = out(reg) _lo,
-            t      = out(reg) _t,
-            sum    = out(reg) _sum,
-            result = out(reg) result,
-            adj    = lateout(reg) _adj,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    result
-}
-
-/// Subtract two Goldilocks elements with borrow handling using inline assembly.
-#[inline(always)]
-unsafe fn sub_asm(a: u64, b: u64) -> u64 {
-    let result: u64;
-    let _adj: u64;
-
-    unsafe {
-        asm!(
-            "subs  {result}, {a}, {b}",
-            "csetm {adj:w}, cc",
-            "sub   {result}, {result}, {adj}",
-            a = in(reg) a,
-            b = in(reg) b,
-            result = out(reg) result,
-            adj = out(reg) _adj,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    result
-}
-
-/// Split-state generic internal permute: s0 stays in a register across all rounds.
-#[inline]
-#[allow(clippy::needless_range_loop)]
-pub fn internal_permute_state_asm<const WIDTH: usize>(
-    state: &mut [u64; WIDTH],
-    diag: &[u64; WIDTH],
-    constants: &[u64],
-) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            s0 = add_asm(s0, rc);
-            let s0_2 = mul_asm(s0, s0);
-            let s0_3 = mul_asm(s0_2, s0);
-            let s0_4 = mul_asm(s0_2, s0_2);
-            s0 = mul_asm(s0_3, s0_4);
-
-            let mut sum_hi: u64 = 0;
-            for i in 1..WIDTH {
-                sum_hi = add_asm(sum_hi, state[i]);
-            }
-
-            let mut diag_muls: [u64; WIDTH] = [0; WIDTH];
-            for i in 1..WIDTH {
-                diag_muls[i] = mul_asm(state[i], diag[i]);
-            }
-
-            let sum = add_asm(sum_hi, s0);
-            s0 = mul_add_asm(s0, diag[0], sum);
-
-            for i in 1..WIDTH {
-                state[i] = add_asm(diag_muls[i], sum);
-            }
-        }
-    }
-    state[0] = s0;
-}
-
-/// Split-state generic dual-lane internal permute for packed processing.
-#[inline]
-#[allow(clippy::needless_range_loop)]
-pub fn internal_permute_split_dual<const WIDTH: usize>(
-    lane0: &mut [u64; WIDTH],
-    lane1: &mut [u64; WIDTH],
-    diag: &[u64; WIDTH],
-    constants: &[u64],
-) {
-    let mut s0_a = lane0[0];
-    let mut s0_b = lane1[0];
-    for &rc in constants {
-        unsafe {
-            s0_a = add_asm(s0_a, rc);
-            s0_b = add_asm(s0_b, rc);
-            let s0_2_a = mul_asm(s0_a, s0_a);
-            let s0_2_b = mul_asm(s0_b, s0_b);
-            let s0_3_a = mul_asm(s0_2_a, s0_a);
-            let s0_3_b = mul_asm(s0_2_b, s0_b);
-            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
-            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
-            s0_a = mul_asm(s0_3_a, s0_4_a);
-            s0_b = mul_asm(s0_3_b, s0_4_b);
-
-            let mut sum_hi_a: u64 = 0;
-            let mut sum_hi_b: u64 = 0;
-            for i in 1..WIDTH {
-                sum_hi_a = add_asm(sum_hi_a, lane0[i]);
-                sum_hi_b = add_asm(sum_hi_b, lane1[i]);
-            }
-
-            let mut diag_muls_a: [u64; WIDTH] = [0; WIDTH];
-            let mut diag_muls_b: [u64; WIDTH] = [0; WIDTH];
-            for i in 1..WIDTH {
-                diag_muls_a[i] = mul_asm(lane0[i], diag[i]);
-                diag_muls_b[i] = mul_asm(lane1[i], diag[i]);
-            }
-
-            let sum_a = add_asm(sum_hi_a, s0_a);
-            let sum_b = add_asm(sum_hi_b, s0_b);
-            s0_a = mul_add_asm(s0_a, diag[0], sum_a);
-            s0_b = mul_add_asm(s0_b, diag[0], sum_b);
-
-            for i in 1..WIDTH {
-                lane0[i] = add_asm(diag_muls_a[i], sum_a);
-                lane1[i] = add_asm(diag_muls_b[i], sum_b);
-            }
-        }
-    }
-    lane0[0] = s0_a;
-    lane1[0] = s0_b;
-}
-
-/// Split-state W8 internal permute: s0 stays in a register across all rounds.
-#[inline]
-pub fn internal_permute_state_asm_w8(state: &mut [u64; 8], constants: &[u64]) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            s0 = add_asm(s0, rc);
-            let s0_2 = mul_asm(s0, s0);
-
-            let sum1 = add_asm(state[1], state[2]);
-            let sum2 = add_asm(state[3], state[4]);
-            let sum3 = add_asm(state[5], state[6]);
-
-            let s0_3 = mul_asm(s0_2, s0);
-            let s0_4 = mul_asm(s0_2, s0_2);
-
-            let sum12 = add_asm(sum1, sum2);
-            let sum37 = add_asm(sum3, state[7]);
-
-            let d1 = state[1];
-            let d2 = double_asm(state[2]);
-            let d3 = div2_asm(state[3]);
-            let d4 = add_asm(double_asm(state[4]), state[4]);
-
-            let sum_hi = add_asm(sum12, sum37);
-
-            let d5 = div2_asm(state[5]);
-            let d6 = add_asm(double_asm(state[6]), state[6]);
-            let d7 = double_asm(double_asm(state[7]));
-
-            s0 = mul_asm(s0_3, s0_4);
-            let sum = add_asm(sum_hi, s0);
-            // V[0]=-2: new_s0 = sum + (-2)*s0 = sum_hi + s0 - 2*s0 = sum_hi - s0
-            s0 = sub_asm(sum_hi, s0);
-
-            state[1] = add_asm(d1, sum);
-            state[2] = add_asm(d2, sum);
-            state[3] = add_asm(d3, sum);
-            state[4] = add_asm(d4, sum);
-            state[5] = sub_asm(sum, d5);
-            state[6] = sub_asm(sum, d6);
-            state[7] = sub_asm(sum, d7);
-        }
-    }
-    state[0] = s0;
-}
-
-/// Split-state dual-lane W8 internal permute for packed processing.
-#[inline]
-pub fn internal_permute_split_dual_w8(
-    lane0: &mut [u64; 8],
-    lane1: &mut [u64; 8],
-    constants: &[u64],
-) {
-    let mut s0_a = lane0[0];
-    let mut s0_b = lane1[0];
-    for &rc in constants {
-        unsafe {
-            s0_a = add_asm(s0_a, rc);
-            s0_b = add_asm(s0_b, rc);
-
-            let s0_2_a = mul_asm(s0_a, s0_a);
-            let s0_2_b = mul_asm(s0_b, s0_b);
-
-            let sum1_a = add_asm(lane0[1], lane0[2]);
-            let sum1_b = add_asm(lane1[1], lane1[2]);
-            let sum2_a = add_asm(lane0[3], lane0[4]);
-            let sum2_b = add_asm(lane1[3], lane1[4]);
-            let sum3_a = add_asm(lane0[5], lane0[6]);
-            let sum3_b = add_asm(lane1[5], lane1[6]);
-
-            let s0_3_a = mul_asm(s0_2_a, s0_a);
-            let s0_3_b = mul_asm(s0_2_b, s0_b);
-            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
-            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
-
-            let sum12_a = add_asm(sum1_a, sum2_a);
-            let sum12_b = add_asm(sum1_b, sum2_b);
-            let sum37_a = add_asm(sum3_a, lane0[7]);
-            let sum37_b = add_asm(sum3_b, lane1[7]);
-
-            let d1_a = lane0[1];
-            let d1_b = lane1[1];
-            let d2_a = double_asm(lane0[2]);
-            let d2_b = double_asm(lane1[2]);
-            let d3_a = div2_asm(lane0[3]);
-            let d3_b = div2_asm(lane1[3]);
-            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]);
-            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);
-
-            let sum_hi_a = add_asm(sum12_a, sum37_a);
-            let sum_hi_b = add_asm(sum12_b, sum37_b);
-
-            let d5_a = div2_asm(lane0[5]);
-            let d5_b = div2_asm(lane1[5]);
-            let d6_a = add_asm(double_asm(lane0[6]), lane0[6]);
-            let d6_b = add_asm(double_asm(lane1[6]), lane1[6]);
-            let d7_a = double_asm(double_asm(lane0[7]));
-            let d7_b = double_asm(double_asm(lane1[7]));
-
-            s0_a = mul_asm(s0_3_a, s0_4_a);
-            s0_b = mul_asm(s0_3_b, s0_4_b);
-
-            let sum_a = add_asm(sum_hi_a, s0_a);
-            let sum_b = add_asm(sum_hi_b, s0_b);
-            s0_a = sub_asm(sum_hi_a, s0_a);
-            s0_b = sub_asm(sum_hi_b, s0_b);
-
-            lane0[1] = add_asm(d1_a, sum_a);
-            lane1[1] = add_asm(d1_b, sum_b);
-            lane0[2] = add_asm(d2_a, sum_a);
-            lane1[2] = add_asm(d2_b, sum_b);
-            lane0[3] = add_asm(d3_a, sum_a);
-            lane1[3] = add_asm(d3_b, sum_b);
-            lane0[4] = add_asm(d4_a, sum_a);
-            lane1[4] = add_asm(d4_b, sum_b);
-            lane0[5] = sub_asm(sum_a, d5_a);
-            lane1[5] = sub_asm(sum_b, d5_b);
-            lane0[6] = sub_asm(sum_a, d6_a);
-            lane1[6] = sub_asm(sum_b, d6_b);
-            lane0[7] = sub_asm(sum_a, d7_a);
-            lane1[7] = sub_asm(sum_b, d7_b);
-        }
-    }
-    lane0[0] = s0_a;
-    lane1[0] = s0_b;
-}
-
-/// Split-state W12 internal permute: s0 stays in a register across all rounds.
-#[inline]
-pub fn internal_permute_state_asm_w12(state: &mut [u64; 12], constants: &[u64]) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            s0 = add_asm(s0, rc);
-            let s0_2 = mul_asm(s0, s0);
-
-            let sum1 = add_asm(state[1], state[2]);
-            let sum2 = add_asm(state[3], state[4]);
-            let sum3 = add_asm(state[5], state[6]);
-            let sum4 = add_asm(state[7], state[8]);
-            let sum5 = add_asm(state[9], state[10]);
-
-            let s0_3 = mul_asm(s0_2, s0);
-            let s0_4 = mul_asm(s0_2, s0_2);
-
-            let sum12 = add_asm(sum1, sum2);
-            let sum34 = add_asm(sum3, sum4);
-            let sum511 = add_asm(sum5, state[11]);
-
-            let d1 = state[1];
-            let d2 = double_asm(state[2]);
-            let d3 = div2_asm(state[3]);
-            let d4 = add_asm(double_asm(state[4]), state[4]);
-
-            let sum1234 = add_asm(sum12, sum34);
-
-            let d5 = double_asm(double_asm(state[5]));
-            let d6 = div2_asm(state[6]);
-            let d7 = add_asm(double_asm(state[7]), state[7]);
-            let d8 = double_asm(double_asm(state[8]));
-
-            let sum_hi = add_asm(sum1234, sum511);
-
-            let d9 = div4_asm(state[9]);
-            let d10 = div4_asm(state[10]);
-            let d11 = div8_asm(state[11]);
-
-            s0 = mul_asm(s0_3, s0_4);
-            let sum = add_asm(sum_hi, s0);
-            s0 = sub_asm(sum_hi, s0);
-
-            state[1] = add_asm(d1, sum);
-            state[2] = add_asm(d2, sum);
-            state[3] = add_asm(d3, sum);
-            state[4] = add_asm(d4, sum);
-            state[5] = add_asm(d5, sum);
-            state[6] = sub_asm(sum, d6);
-            state[7] = sub_asm(sum, d7);
-            state[8] = sub_asm(sum, d8);
-            state[9] = add_asm(d9, sum);
-            state[10] = sub_asm(sum, d10);
-            state[11] = add_asm(d11, sum);
-        }
-    }
-    state[0] = s0;
-}
-
-/// Split-state dual-lane W12 internal permute for packed processing.
-#[inline]
-pub fn internal_permute_split_dual_w12(
-    lane0: &mut [u64; 12],
-    lane1: &mut [u64; 12],
-    constants: &[u64],
-) {
-    let mut s0_a = lane0[0];
-    let mut s0_b = lane1[0];
-    for &rc in constants {
-        unsafe {
-            s0_a = add_asm(s0_a, rc);
-            s0_b = add_asm(s0_b, rc);
-
-            let s0_2_a = mul_asm(s0_a, s0_a);
-            let s0_2_b = mul_asm(s0_b, s0_b);
-
-            let sum1_a = add_asm(lane0[1], lane0[2]);
-            let sum1_b = add_asm(lane1[1], lane1[2]);
-            let sum2_a = add_asm(lane0[3], lane0[4]);
-            let sum2_b = add_asm(lane1[3], lane1[4]);
-            let sum3_a = add_asm(lane0[5], lane0[6]);
-            let sum3_b = add_asm(lane1[5], lane1[6]);
-            let sum4_a = add_asm(lane0[7], lane0[8]);
-            let sum4_b = add_asm(lane1[7], lane1[8]);
-            let sum5_a = add_asm(lane0[9], lane0[10]);
-            let sum5_b = add_asm(lane1[9], lane1[10]);
-
-            let s0_3_a = mul_asm(s0_2_a, s0_a);
-            let s0_3_b = mul_asm(s0_2_b, s0_b);
-            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
-            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
-
-            let sum12_a = add_asm(sum1_a, sum2_a);
-            let sum12_b = add_asm(sum1_b, sum2_b);
-            let sum34_a = add_asm(sum3_a, sum4_a);
-            let sum34_b = add_asm(sum3_b, sum4_b);
-            let sum511_a = add_asm(sum5_a, lane0[11]);
-            let sum511_b = add_asm(sum5_b, lane1[11]);
-
-            let d1_a = lane0[1];
-            let d1_b = lane1[1];
-            let d2_a = double_asm(lane0[2]);
-            let d2_b = double_asm(lane1[2]);
-            let d3_a = div2_asm(lane0[3]);
-            let d3_b = div2_asm(lane1[3]);
-            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]);
-            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);
-
-            let sum1234_a = add_asm(sum12_a, sum34_a);
-            let sum1234_b = add_asm(sum12_b, sum34_b);
-
-            let d5_a = double_asm(double_asm(lane0[5]));
-            let d5_b = double_asm(double_asm(lane1[5]));
-            let d6_a = div2_asm(lane0[6]);
-            let d6_b = div2_asm(lane1[6]);
-            let d7_a = add_asm(double_asm(lane0[7]), lane0[7]);
-            let d7_b = add_asm(double_asm(lane1[7]), lane1[7]);
-            let d8_a = double_asm(double_asm(lane0[8]));
-            let d8_b = double_asm(double_asm(lane1[8]));
-
-            let sum_hi_a = add_asm(sum1234_a, sum511_a);
-            let sum_hi_b = add_asm(sum1234_b, sum511_b);
-
-            let d9_a = div4_asm(lane0[9]);
-            let d9_b = div4_asm(lane1[9]);
-            let d10_a = div4_asm(lane0[10]);
-            let d10_b = div4_asm(lane1[10]);
-            let d11_a = div8_asm(lane0[11]);
-            let d11_b = div8_asm(lane1[11]);
-
-            s0_a = mul_asm(s0_3_a, s0_4_a);
-            s0_b = mul_asm(s0_3_b, s0_4_b);
-
-            let sum_a = add_asm(sum_hi_a, s0_a);
-            let sum_b = add_asm(sum_hi_b, s0_b);
-            s0_a = sub_asm(sum_hi_a, s0_a);
-            s0_b = sub_asm(sum_hi_b, s0_b);
-
-            lane0[1] = add_asm(d1_a, sum_a);
-            lane1[1] = add_asm(d1_b, sum_b);
-            lane0[2] = add_asm(d2_a, sum_a);
-            lane1[2] = add_asm(d2_b, sum_b);
-            lane0[3] = add_asm(d3_a, sum_a);
-            lane1[3] = add_asm(d3_b, sum_b);
-            lane0[4] = add_asm(d4_a, sum_a);
-            lane1[4] = add_asm(d4_b, sum_b);
-            lane0[5] = add_asm(d5_a, sum_a);
-            lane1[5] = add_asm(d5_b, sum_b);
-            lane0[6] = sub_asm(sum_a, d6_a);
-            lane1[6] = sub_asm(sum_b, d6_b);
-            lane0[7] = sub_asm(sum_a, d7_a);
-            lane1[7] = sub_asm(sum_b, d7_b);
-            lane0[8] = sub_asm(sum_a, d8_a);
-            lane1[8] = sub_asm(sum_b, d8_b);
-            lane0[9] = add_asm(d9_a, sum_a);
-            lane1[9] = add_asm(d9_b, sum_b);
-            lane0[10] = sub_asm(sum_a, d10_a);
-            lane1[10] = sub_asm(sum_b, d10_b);
-            lane0[11] = add_asm(d11_a, sum_a);
-            lane1[11] = add_asm(d11_b, sum_b);
-        }
-    }
-    lane0[0] = s0_a;
-    lane1[0] = s0_b;
-}
-
-/// Split-state W16 internal permute: s0 stays in a register across all rounds.
-#[inline]
-pub fn internal_permute_state_asm_w16(state: &mut [u64; 16], constants: &[u64]) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            s0 = add_asm(s0, rc);
-            let s0_2 = mul_asm(s0, s0);
-
-            let sum1 = add_asm(state[1], state[2]);
-            let sum2 = add_asm(state[3], state[4]);
-            let sum3 = add_asm(state[5], state[6]);
-            let sum4 = add_asm(state[7], state[8]);
-            let sum5 = add_asm(state[9], state[10]);
-            let sum6 = add_asm(state[11], state[12]);
-            let sum7 = add_asm(state[13], state[14]);
-
-            let s0_3 = mul_asm(s0_2, s0);
-            let s0_4 = mul_asm(s0_2, s0_2);
-
-            let sum12 = add_asm(sum1, sum2);
-            let sum34 = add_asm(sum3, sum4);
-            let sum56 = add_asm(sum5, sum6);
-            let sum715 = add_asm(sum7, state[15]);
-
-            let sum1234 = add_asm(sum12, sum34);
-            let sum56715 = add_asm(sum56, sum715);
-            let sum_hi = add_asm(sum1234, sum56715);
-
-            let d1 = state[1];
-            let d2 = double_asm(state[2]);
-            let d3 = div2_asm(state[3]);
-            let d4 = add_asm(double_asm(state[4]), state[4]);
-            let d5 = double_asm(double_asm(state[5]));
-            let d6 = div2_asm(state[6]);
-            let d7 = add_asm(double_asm(state[7]), state[7]);
-            let d8 = double_asm(double_asm(state[8]));
-
-            let d9 = div8_asm(state[9]);
-            let d10 = div16_asm(state[10]);
-            let d11 = div32_asm(state[11]);
-            let d12 = div8_asm(state[12]);
-            let d13 = div16_asm(state[13]);
-            let d14 = div32_asm(state[14]);
-            let d15 = div_2_32_asm(state[15]);
-
-            s0 = mul_asm(s0_3, s0_4);
-            let sum = add_asm(sum_hi, s0);
-            s0 = sub_asm(sum_hi, s0);
-
-            state[1] = add_asm(d1, sum);
-            state[2] = add_asm(d2, sum);
-            state[3] = add_asm(d3, sum);
-            state[4] = add_asm(d4, sum);
-            state[5] = add_asm(d5, sum);
-            state[6] = sub_asm(sum, d6);
-            state[7] = sub_asm(sum, d7);
-            state[8] = sub_asm(sum, d8);
-            state[9] = add_asm(d9, sum);
-            state[10] = add_asm(d10, sum);
-            state[11] = add_asm(d11, sum);
-            state[12] = sub_asm(sum, d12);
-            state[13] = sub_asm(sum, d13);
-            state[14] = sub_asm(sum, d14);
-            state[15] = add_asm(d15, sum);
-        }
-    }
-    state[0] = s0;
-}
-
-/// Split-state dual-lane W16 internal permute for packed processing.
-#[inline]
-pub fn internal_permute_split_dual_w16(
-    lane0: &mut [u64; 16],
-    lane1: &mut [u64; 16],
-    constants: &[u64],
-) {
-    let mut s0_a = lane0[0];
-    let mut s0_b = lane1[0];
-    for &rc in constants {
-        unsafe {
-            s0_a = add_asm(s0_a, rc);
-            s0_b = add_asm(s0_b, rc);
-
-            let s0_2_a = mul_asm(s0_a, s0_a);
-            let s0_2_b = mul_asm(s0_b, s0_b);
-
-            let sum1_a = add_asm(lane0[1], lane0[2]);
-            let sum1_b = add_asm(lane1[1], lane1[2]);
-            let sum2_a = add_asm(lane0[3], lane0[4]);
-            let sum2_b = add_asm(lane1[3], lane1[4]);
-            let sum3_a = add_asm(lane0[5], lane0[6]);
-            let sum3_b = add_asm(lane1[5], lane1[6]);
-            let sum4_a = add_asm(lane0[7], lane0[8]);
-            let sum4_b = add_asm(lane1[7], lane1[8]);
-            let sum5_a = add_asm(lane0[9], lane0[10]);
-            let sum5_b = add_asm(lane1[9], lane1[10]);
-            let sum6_a = add_asm(lane0[11], lane0[12]);
-            let sum6_b = add_asm(lane1[11], lane1[12]);
-            let sum7_a = add_asm(lane0[13], lane0[14]);
-            let sum7_b = add_asm(lane1[13], lane1[14]);
-
-            let s0_3_a = mul_asm(s0_2_a, s0_a);
-            let s0_3_b = mul_asm(s0_2_b, s0_b);
-            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
-            let s0_4_b = mul_asm(s0_2_b, s0_2_b);
-
-            let sum12_a = add_asm(sum1_a, sum2_a);
-            let sum12_b = add_asm(sum1_b, sum2_b);
-            let sum34_a = add_asm(sum3_a, sum4_a);
-            let sum34_b = add_asm(sum3_b, sum4_b);
-            let sum56_a = add_asm(sum5_a, sum6_a);
-            let sum56_b = add_asm(sum5_b, sum6_b);
-            let sum715_a = add_asm(sum7_a, lane0[15]);
-            let sum715_b = add_asm(sum7_b, lane1[15]);
-
-            let sum1234_a = add_asm(sum12_a, sum34_a);
-            let sum1234_b = add_asm(sum12_b, sum34_b);
-            let sum56715_a = add_asm(sum56_a, sum715_a);
-            let sum56715_b = add_asm(sum56_b, sum715_b);
-            let sum_hi_a = add_asm(sum1234_a, sum56715_a);
-            let sum_hi_b = add_asm(sum1234_b, sum56715_b);
-
-            let d1_a = lane0[1];
-            let d1_b = lane1[1];
-            let d2_a = double_asm(lane0[2]);
-            let d2_b = double_asm(lane1[2]);
-            let d3_a = div2_asm(lane0[3]);
-            let d3_b = div2_asm(lane1[3]);
-            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]);
-            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);
-            let d5_a = double_asm(double_asm(lane0[5]));
-            let d5_b = double_asm(double_asm(lane1[5]));
-            let d6_a = div2_asm(lane0[6]);
-            let d6_b = div2_asm(lane1[6]);
-            let d7_a = add_asm(double_asm(lane0[7]), lane0[7]);
-            let d7_b = add_asm(double_asm(lane1[7]), lane1[7]);
-            let d8_a = double_asm(double_asm(lane0[8]));
-            let d8_b = double_asm(double_asm(lane1[8]));
-
-            let d9_a = div8_asm(lane0[9]);
-            let d9_b = div8_asm(lane1[9]);
-            let d10_a = div16_asm(lane0[10]);
-            let d10_b = div16_asm(lane1[10]);
-            let d11_a = div32_asm(lane0[11]);
-            let d11_b = div32_asm(lane1[11]);
-            let d12_a = div8_asm(lane0[12]);
-            let d12_b = div8_asm(lane1[12]);
-            let d13_a = div16_asm(lane0[13]);
-            let d13_b = div16_asm(lane1[13]);
-            let d14_a = div32_asm(lane0[14]);
-            let d14_b = div32_asm(lane1[14]);
-            let d15_a = div_2_32_asm(lane0[15]);
-            let d15_b = div_2_32_asm(lane1[15]);
-
-            s0_a = mul_asm(s0_3_a, s0_4_a);
-            s0_b = mul_asm(s0_3_b, s0_4_b);
-
-            let sum_a = add_asm(sum_hi_a, s0_a);
-            let sum_b = add_asm(sum_hi_b, s0_b);
-            s0_a = sub_asm(sum_hi_a, s0_a);
-            s0_b = sub_asm(sum_hi_b, s0_b);
-
-            lane0[1] = add_asm(d1_a, sum_a);
-            lane1[1] = add_asm(d1_b, sum_b);
-            lane0[2] = add_asm(d2_a, sum_a);
-            lane1[2] = add_asm(d2_b, sum_b);
-            lane0[3] = add_asm(d3_a, sum_a);
-            lane1[3] = add_asm(d3_b, sum_b);
-            lane0[4] = add_asm(d4_a, sum_a);
-            lane1[4] = add_asm(d4_b, sum_b);
-            lane0[5] = add_asm(d5_a, sum_a);
-            lane1[5] = add_asm(d5_b, sum_b);
-            lane0[6] = sub_asm(sum_a, d6_a);
-            lane1[6] = sub_asm(sum_b, d6_b);
-            lane0[7] = sub_asm(sum_a, d7_a);
-            lane1[7] = sub_asm(sum_b, d7_b);
-            lane0[8] = sub_asm(sum_a, d8_a);
-            lane1[8] = sub_asm(sum_b, d8_b);
-            lane0[9] = add_asm(d9_a, sum_a);
-            lane1[9] = add_asm(d9_b, sum_b);
-            lane0[10] = add_asm(d10_a, sum_a);
-            lane1[10] = add_asm(d10_b, sum_b);
-            lane0[11] = add_asm(d11_a, sum_a);
-            lane1[11] = add_asm(d11_b, sum_b);
-            lane0[12] = sub_asm(sum_a, d12_a);
-            lane1[12] = sub_asm(sum_b, d12_b);
-            lane0[13] = sub_asm(sum_a, d13_a);
-            lane1[13] = sub_asm(sum_b, d13_b);
-            lane0[14] = sub_asm(sum_a, d14_a);
-            lane1[14] = sub_asm(sum_b, d14_b);
-            lane0[15] = add_asm(d15_a, sum_a);
-            lane1[15] = add_asm(d15_b, sum_b);
-        }
-    }
-    lane0[0] = s0_a;
-    lane1[0] = s0_b;
-}
-
-// External layer: S-box on all elements, then MDS. Pipelined for latency hiding.
-
-/// Double a Goldilocks element.
-#[inline(always)]
-unsafe fn double_asm(a: u64) -> u64 {
-    // SAFETY: add_asm is safe with valid Goldilocks field elements
-    unsafe { add_asm(a, a) }
-}
-
-/// 4x4 circulant MDS with coefficients [2,3,1,1].
-#[inline(always)]
-unsafe fn apply_mat4_asm(x: &mut [u64; 4]) {
-    unsafe {
-        let t01 = add_asm(x[0], x[1]);
-        let t23 = add_asm(x[2], x[3]);
-        let t0123 = add_asm(t01, t23);
-        let t01123 = add_asm(t0123, x[1]);
-        let t01233 = add_asm(t0123, x[3]);
-
-        let y3 = add_asm(t01233, double_asm(x[0]));
-        let y1 = add_asm(t01123, double_asm(x[2]));
-        let y0 = add_asm(t01123, t01);
-        let y2 = add_asm(t01233, t23);
-
-        x[0] = y0;
-        x[1] = y1;
-        x[2] = y2;
-        x[3] = y3;
-    }
-}
-
-/// Poseidon2 MDS light permutation: 4x4 blocks + outer sums.
-#[inline(always)]
-pub unsafe fn mds_light_permutation_asm<const WIDTH: usize>(state: &mut [u64; WIDTH]) {
-    unsafe {
-        // Apply M_4 to each consecutive four elements
-        let mut i = 0;
-        while i < WIDTH {
-            let chunk: &mut [u64; 4] = (&mut state[i..i + 4]).try_into().unwrap();
-            apply_mat4_asm(chunk);
-            i += 4;
-        }
-
-        // Compute the four sums of every 4th element
-        let mut sums = [0u64; 4];
-        for j in (0..WIDTH).step_by(4) {
-            sums[0] = add_asm(sums[0], state[j]);
-            sums[1] = add_asm(sums[1], state[j + 1]);
-            sums[2] = add_asm(sums[2], state[j + 2]);
-            sums[3] = add_asm(sums[3], state[j + 3]);
-        }
-
-        // Add sums back to state
-        for (i, elem) in state.iter_mut().enumerate() {
-            *elem = add_asm(*elem, sums[i % 4]);
-        }
-    }
-}
-
-/// Pipelined S-box computation for all elements.
-/// Computes x^7 for all elements by interleaving stages to hide latency.
-#[inline(always)]
-pub unsafe fn sbox_layer_asm<const WIDTH: usize>(state: &mut [u64; WIDTH]) {
-    unsafe {
-        // Stage 1: Compute all x^2 values
-        let mut x2 = [0u64; WIDTH];
-        for i in 0..WIDTH {
-            x2[i] = mul_asm(state[i], state[i]);
-        }
-
-        // Stage 2: Compute x^3 and x^4 values interleaved
-        // x^3 = x^2 * x, x^4 = x^2 * x^2
-        let mut x3 = [0u64; WIDTH];
-        let mut x4 = [0u64; WIDTH];
-        for i in 0..WIDTH {
-            x3[i] = mul_asm(x2[i], state[i]);
-            x4[i] = mul_asm(x2[i], x2[i]);
-        }
-
-        // Stage 3: Compute x^7 = x^3 * x^4
-        for i in 0..WIDTH {
-            state[i] = mul_asm(x3[i], x4[i]);
-        }
-    }
-}
-
-/// Optimized external round: add RC, S-box, MDS.
-#[inline(always)]
-pub unsafe fn external_round_asm<const WIDTH: usize>(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) {
-    unsafe {
-        // Add round constants
-        for i in 0..WIDTH {
-            state[i] = add_asm(state[i], rc[i]);
-        }
-
-        // Apply S-box (x^7) to all elements
-        sbox_layer_asm(state);
-
-        // Apply MDS light permutation
-        mds_light_permutation_asm(state);
-    }
-}
-
-/// Interleaved dual-lane S-box layer for better ILP.
-#[inline(always)]
-pub unsafe fn sbox_layer_dual_asm<const WIDTH: usize>(
-    state0: &mut [u64; WIDTH],
-    state1: &mut [u64; WIDTH],
-) {
-    unsafe {
-        // Stage 1: Compute all x^2 values for both lanes (interleaved)
-        let mut x2_a = [0u64; WIDTH];
-        let mut x2_b = [0u64; WIDTH];
-        for i in 0..WIDTH {
-            x2_a[i] = mul_asm(state0[i], state0[i]);
-            x2_b[i] = mul_asm(state1[i], state1[i]);
-        }
-
-        // Stage 2: Compute x^3 and x^4 for both lanes (interleaved)
-        let mut x3_a = [0u64; WIDTH];
-        let mut x3_b = [0u64; WIDTH];
-        let mut x4_a = [0u64; WIDTH];
-        let mut x4_b = [0u64; WIDTH];
-        for i in 0..WIDTH {
-            x3_a[i] = mul_asm(x2_a[i], state0[i]);
-            x3_b[i] = mul_asm(x2_b[i], state1[i]);
-            x4_a[i] = mul_asm(x2_a[i], x2_a[i]);
-            x4_b[i] = mul_asm(x2_b[i], x2_b[i]);
-        }
-
-        // Stage 3: Compute x^7 = x^3 * x^4 for both lanes
-        for i in 0..WIDTH {
-            state0[i] = mul_asm(x3_a[i], x4_a[i]);
-            state1[i] = mul_asm(x3_b[i], x4_b[i]);
-        }
-    }
-}
-
-/// Interleaved dual-lane external round for better ILP.
-#[inline(always)]
-pub unsafe fn external_round_dual_asm<const WIDTH: usize>(
-    state0: &mut [u64; WIDTH],
-    state1: &mut [u64; WIDTH],
-    rc: &[u64; WIDTH],
-) {
-    unsafe {
-        // Add round constants (interleaved)
-        for i in 0..WIDTH {
-            state0[i] = add_asm(state0[i], rc[i]);
-            state1[i] = add_asm(state1[i], rc[i]);
-        }
-
-        // Apply S-box (interleaved dual-lane)
-        sbox_layer_dual_asm(state0, state1);
-
-        // Apply MDS (sequential - MDS is mostly additions which are fast)
-        mds_light_permutation_asm(state0);
-        mds_light_permutation_asm(state1);
-    }
-}
-
-/// Fully unrolled and fused external round for W8.
-#[inline(always)]
-pub unsafe fn external_round_fused_w8(state: &mut [u64; 8], rc: &[u64; 8]) {
-    unsafe {
-        let s0 = add_asm(state[0], rc[0]);
-        let s1 = add_asm(state[1], rc[1]);
-        let x2_0 = mul_asm(s0, s0);
-        let x2_1 = mul_asm(s1, s1);
-
-        let s2 = add_asm(state[2], rc[2]);
-        let s3 = add_asm(state[3], rc[3]);
-        let x2_2 = mul_asm(s2, s2);
-        let x2_3 = mul_asm(s3, s3);
-
-        let s4 = add_asm(state[4], rc[4]);
-        let s5 = add_asm(state[5], rc[5]);
-        let x2_4 = mul_asm(s4, s4);
-        let x2_5 = mul_asm(s5, s5);
-
-        let s6 = add_asm(state[6], rc[6]);
-        let s7 = add_asm(state[7], rc[7]);
-        let x2_6 = mul_asm(s6, s6);
-        let x2_7 = mul_asm(s7, s7);
-
-        let x3_0 = mul_asm(x2_0, s0);
-        let x3_1 = mul_asm(x2_1, s1);
-        let x4_0 = mul_asm(x2_0, x2_0);
-        let x4_1 = mul_asm(x2_1, x2_1);
-        let x3_2 = mul_asm(x2_2, s2);
-        let x3_3 = mul_asm(x2_3, s3);
-        let x4_2 = mul_asm(x2_2, x2_2);
-        let x4_3 = mul_asm(x2_3, x2_3);
-        let x3_4 = mul_asm(x2_4, s4);
-        let x3_5 = mul_asm(x2_5, s5);
-        let x4_4 = mul_asm(x2_4, x2_4);
-        let x4_5 = mul_asm(x2_5, x2_5);
-        let x3_6 = mul_asm(x2_6, s6);
-        let x3_7 = mul_asm(x2_7, s7);
-        let x4_6 = mul_asm(x2_6, x2_6);
-        let x4_7 = mul_asm(x2_7, x2_7);
-
-        state[0] = mul_asm(x3_0, x4_0);
-        state[1] = mul_asm(x3_1, x4_1);
-        state[2] = mul_asm(x3_2, x4_2);
-        state[3] = mul_asm(x3_3, x4_3);
-        state[4] = mul_asm(x3_4, x4_4);
-        state[5] = mul_asm(x3_5, x4_5);
-        state[6] = mul_asm(x3_6, x4_6);
-        state[7] = mul_asm(x3_7, x4_7);
-
-        mds_light_permutation_asm(state);
-    }
-}
-
-/// Fully unrolled and fused dual-lane external round for W8.
-#[inline(always)]
-pub unsafe fn external_round_fused_dual_w8(
-    state0: &mut [u64; 8],
-    state1: &mut [u64; 8],
-    rc: &[u64; 8],
-) {
-    unsafe {
-        // Half 1: elements 0-3 across both lanes
-        let s0_a = add_asm(state0[0], rc[0]);
-        let s0_b = add_asm(state1[0], rc[0]);
-        let s1_a = add_asm(state0[1], rc[1]);
-        let s1_b = add_asm(state1[1], rc[1]);
-        let s2_a = add_asm(state0[2], rc[2]);
-        let s2_b = add_asm(state1[2], rc[2]);
-        let s3_a = add_asm(state0[3], rc[3]);
-        let s3_b = add_asm(state1[3], rc[3]);
-
-        let x2_0a = mul_asm(s0_a, s0_a);
-        let x2_0b = mul_asm(s0_b, s0_b);
-        let x2_1a = mul_asm(s1_a, s1_a);
-        let x2_1b = mul_asm(s1_b, s1_b);
-        let x2_2a = mul_asm(s2_a, s2_a);
-        let x2_2b = mul_asm(s2_b, s2_b);
-        let x2_3a = mul_asm(s3_a, s3_a);
-        let x2_3b = mul_asm(s3_b, s3_b);
-
-        let x3_0a = mul_asm(x2_0a, s0_a);
-        let x3_0b = mul_asm(x2_0b, s0_b);
-        let x4_0a = mul_asm(x2_0a, x2_0a);
-        let x4_0b = mul_asm(x2_0b, x2_0b);
-        let x3_1a = mul_asm(x2_1a, s1_a);
-        let x3_1b = mul_asm(x2_1b, s1_b);
-        let x4_1a = mul_asm(x2_1a, x2_1a);
-        let x4_1b = mul_asm(x2_1b, x2_1b);
-        let x3_2a = mul_asm(x2_2a, s2_a);
-        let x3_2b = mul_asm(x2_2b, s2_b);
-        let x4_2a = mul_asm(x2_2a, x2_2a);
-        let x4_2b = mul_asm(x2_2b, x2_2b);
-        let x3_3a = mul_asm(x2_3a, s3_a);
-        let x3_3b = mul_asm(x2_3b, s3_b);
-        let x4_3a = mul_asm(x2_3a, x2_3a);
-        let x4_3b = mul_asm(x2_3b, x2_3b);
-
-        state0[0] = mul_asm(x3_0a, x4_0a);
-        state1[0] = mul_asm(x3_0b, x4_0b);
-        state0[1] = mul_asm(x3_1a, x4_1a);
-        state1[1] = mul_asm(x3_1b, x4_1b);
-        state0[2] = mul_asm(x3_2a, x4_2a);
-        state1[2] = mul_asm(x3_2b, x4_2b);
-        state0[3] = mul_asm(x3_3a, x4_3a);
-        state1[3] = mul_asm(x3_3b, x4_3b);
-
-        // Half 2: elements 4-7 across both lanes
-        let s4_a = add_asm(state0[4], rc[4]);
-        let s4_b = add_asm(state1[4], rc[4]);
-        let s5_a = add_asm(state0[5], rc[5]);
-        let s5_b = add_asm(state1[5], rc[5]);
-        let s6_a = add_asm(state0[6], rc[6]);
-        let s6_b = add_asm(state1[6], rc[6]);
-        let s7_a = add_asm(state0[7], rc[7]);
-        let s7_b = add_asm(state1[7], rc[7]);
-
-        let x2_4a = mul_asm(s4_a, s4_a);
-        let x2_4b = mul_asm(s4_b, s4_b);
-        let x2_5a = mul_asm(s5_a, s5_a);
-        let x2_5b = mul_asm(s5_b, s5_b);
-        let x2_6a = mul_asm(s6_a, s6_a);
-        let x2_6b = mul_asm(s6_b, s6_b);
-        let x2_7a = mul_asm(s7_a, s7_a);
-        let x2_7b = mul_asm(s7_b, s7_b);
-
-        let x3_4a = mul_asm(x2_4a, s4_a);
-        let x3_4b = mul_asm(x2_4b, s4_b);
-        let x4_4a = mul_asm(x2_4a, x2_4a);
-        let x4_4b = mul_asm(x2_4b, x2_4b);
-        let x3_5a = mul_asm(x2_5a, s5_a);
-        let x3_5b = mul_asm(x2_5b, s5_b);
-        let x4_5a = mul_asm(x2_5a, x2_5a);
-        let x4_5b = mul_asm(x2_5b, x2_5b);
-        let x3_6a = mul_asm(x2_6a, s6_a);
-        let x3_6b = mul_asm(x2_6b, s6_b);
-        let x4_6a = mul_asm(x2_6a, x2_6a);
-        let x4_6b = mul_asm(x2_6b, x2_6b);
-        let x3_7a = mul_asm(x2_7a, s7_a);
-        let x3_7b = mul_asm(x2_7b, s7_b);
-        let x4_7a = mul_asm(x2_7a, x2_7a);
-        let x4_7b = mul_asm(x2_7b, x2_7b);
-
-        state0[4] = mul_asm(x3_4a, x4_4a);
-        state1[4] = mul_asm(x3_4b, x4_4b);
-        state0[5] = mul_asm(x3_5a, x4_5a);
-        state1[5] = mul_asm(x3_5b, x4_5b);
-        state0[6] = mul_asm(x3_6a, x4_6a);
-        state1[6] = mul_asm(x3_6b, x4_6b);
-        state0[7] = mul_asm(x3_7a, x4_7a);
-        state1[7] = mul_asm(x3_7b, x4_7b);
-
-        mds_light_permutation_asm(state0);
-        mds_light_permutation_asm(state1);
-    }
-}
-
-/// Run initial external rounds with pre-converted raw u64 constants.
-#[inline]
-pub fn external_initial_permute_state_asm<const WIDTH: usize>(
-    state: &mut [u64; WIDTH],
-    initial_constants: &[[u64; WIDTH]],
-) {
-    unsafe {
-        mds_light_permutation_asm(state);
-    }
-    for rc in initial_constants {
-        unsafe {
-            external_round_asm(state, rc);
-        }
-    }
-}
-
-/// Run terminal external rounds with pre-converted raw u64 constants.
-#[inline]
-pub fn external_terminal_permute_state_asm<const WIDTH: usize>(
-    state: &mut [u64; WIDTH],
-    terminal_constants: &[[u64; WIDTH]],
-) {
-    for rc in terminal_constants {
-        unsafe {
-            external_round_asm(state, rc);
-        }
-    }
-}
-
-/// W8-specialized initial external permute using fused rounds.
-#[inline]
-pub fn external_initial_permute_w8(state: &mut [u64; 8], initial_constants: &[[u64; 8]]) {
-    unsafe {
-        mds_light_permutation_asm(state);
-    }
-    for rc in initial_constants {
-        unsafe {
-            external_round_fused_w8(state, rc);
-        }
-    }
-}
-
-/// W8-specialized terminal external permute using fused rounds.
-#[inline]
-pub fn external_terminal_permute_w8(state: &mut [u64; 8], terminal_constants: &[[u64; 8]]) {
-    for rc in terminal_constants {
-        unsafe {
-            external_round_fused_w8(state, rc);
-        }
-    }
-}
-
-/// Dual-lane initial external permute with pre-converted constants.
-#[inline]
-pub fn external_initial_permute_dual<const WIDTH: usize>(
-    lane0: &mut [u64; WIDTH],
-    lane1: &mut [u64; WIDTH],
-    constants: &[[u64; WIDTH]],
-) {
-    unsafe {
-        mds_light_permutation_asm(lane0);
-        mds_light_permutation_asm(lane1);
-    }
-    for rc in constants {
-        unsafe {
-            external_round_dual_asm(lane0, lane1, rc);
-        }
-    }
-}
-
-/// Dual-lane terminal external permute with pre-converted constants.
-#[inline]
-pub fn external_terminal_permute_dual<const WIDTH: usize>(
-    lane0: &mut [u64; WIDTH],
-    lane1: &mut [u64; WIDTH],
-    constants: &[[u64; WIDTH]],
-) {
-    for rc in constants {
-        unsafe {
-            external_round_dual_asm(lane0, lane1, rc);
-        }
-    }
-}
-
-/// W8-specialized dual-lane initial external permute using fused rounds.
-#[inline]
-pub fn external_initial_permute_dual_w8(
-    lane0: &mut [u64; 8],
-    lane1: &mut [u64; 8],
-    constants: &[[u64; 8]],
-) {
-    unsafe {
-        mds_light_permutation_asm(lane0);
-        mds_light_permutation_asm(lane1);
-    }
-    for rc in constants {
-        unsafe {
-            external_round_fused_dual_w8(lane0, lane1, rc);
-        }
-    }
-}
-
-/// W8-specialized dual-lane terminal external permute using fused rounds.
-#[inline]
-pub fn external_terminal_permute_dual_w8(
-    lane0: &mut [u64; 8],
-    lane1: &mut [u64; 8],
-    constants: &[[u64; 8]],
-) {
-    for rc in constants {
-        unsafe {
-            external_round_fused_dual_w8(lane0, lane1, rc);
-        }
-    }
-}
-
-// NEON 2-wide Goldilocks field primitives.
-// Each operates on both packed lanes simultaneously using uint64x2_t.
-
-#[inline(always)]
-unsafe fn add_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let res = vaddq_u64(a, b);
-        let overflow = vcgtq_u64(a, res);
-        let adj = vshrq_n_u64::<32>(overflow);
-        vaddq_u64(res, adj)
-    }
-}
-
-#[inline(always)]
-unsafe fn sub_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let res = vsubq_u64(a, b);
-        let underflow = vcgtq_u64(b, a);
-        let adj = vshrq_n_u64::<32>(underflow);
-        vsubq_u64(res, adj)
-    }
-}
-
-#[inline(always)]
-unsafe fn double_neon(a: uint64x2_t) -> uint64x2_t {
-    unsafe { add_neon(a, a) }
-}
-
-#[inline(always)]
-unsafe fn div2_neon(x: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let half_p_plus_1 = vdupq_n_u64((P + 1) >> 1);
-        let one = vdupq_n_u64(1);
-        let is_odd = vandq_u64(x, one);
-        let half = vshrq_n_u64::<1>(x);
-        let mask = vtstq_u64(is_odd, is_odd);
-        let adj = vandq_u64(mask, half_p_plus_1);
-        vaddq_u64(half, adj)
-    }
-}
-
-#[inline(always)]
-unsafe fn div4_neon(x: uint64x2_t) -> uint64x2_t {
-    unsafe { div2_neon(div2_neon(x)) }
-}
-
-#[inline(always)]
-unsafe fn div8_neon(x: uint64x2_t) -> uint64x2_t {
-    unsafe { div2_neon(div4_neon(x)) }
-}
-
-#[inline(always)]
-unsafe fn div16_neon(x: uint64x2_t) -> uint64x2_t {
-    unsafe { div2_neon(div8_neon(x)) }
-}
-
-#[inline(always)]
-unsafe fn div32_neon(x: uint64x2_t) -> uint64x2_t {
-    unsafe { div4_neon(div8_neon(x)) }
-}
-
-/// Compute x * 2^{-32} mod P for each lane using Goldilocks structure.
-///
-/// Since P = 2^64 - 2^32 + 1, we have 2^{-32} ≡ 1 - 2^{32} (mod P).
-/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P.
-#[inline(always)]
-unsafe fn div_2_32_neon(x: uint64x2_t) -> uint64x2_t {
-    unsafe {
-        let mask_32 = vdupq_n_u64(0xFFFFFFFF);
-        let hi = vshrq_n_u64::<32>(x);
-        let lo = vandq_u64(x, mask_32);
-        let sum = vaddq_u64(hi, lo);
-        let t = vshlq_n_u64::<32>(lo);
-        sub_neon(sum, t)
-    }
-}
-
-#[inline(always)]
-unsafe fn apply_mat4_neon(x: &mut [uint64x2_t; 4]) {
-    unsafe {
-        let t01 = add_neon(x[0], x[1]);
-        let t23 = add_neon(x[2], x[3]);
-        let t0123 = add_neon(t01, t23);
-        let t01123 = add_neon(t0123, x[1]);
-        let t01233 = add_neon(t0123, x[3]);
-        x[3] = add_neon(t01233, double_neon(x[0]));
-        x[1] = add_neon(t01123, double_neon(x[2]));
-        x[0] = add_neon(t01123, t01);
-        x[2] = add_neon(t01233, t23);
-    }
-}
-
-#[inline(always)]
-unsafe fn mds_light_neon<const WIDTH: usize>(state: &mut [uint64x2_t; WIDTH]) {
-    unsafe {
-        let mut i = 0;
-        while i < WIDTH {
-            let chunk: &mut [uint64x2_t; 4] = (&mut state[i..i + 4]).try_into().unwrap();
-            apply_mat4_neon(chunk);
-            i += 4;
-        }
-        let zero = vdupq_n_u64(0);
-        let mut sums = [zero; 4];
-        for j in (0..WIDTH).step_by(4) {
-            sums[0] = add_neon(sums[0], state[j]);
-            sums[1] = add_neon(sums[1], state[j + 1]);
-            sums[2] = add_neon(sums[2], state[j + 2]);
-            sums[3] = add_neon(sums[3], state[j + 3]);
-        }
-        for (i, elem) in state.iter_mut().enumerate() {
-            *elem = add_neon(*elem, sums[i % 4]);
-        }
-    }
-}
-
-/// Convert separate lane arrays into NEON vector array.
-#[inline]
-pub fn lanes_to_neon<const WIDTH: usize>(
-    lane0: &[u64; WIDTH],
-    lane1: &[u64; WIDTH],
-) -> [uint64x2_t; WIDTH] {
-    core::array::from_fn(|i| unsafe {
-        let lo = vcreate_u64(lane0[i]);
-        let hi = vcreate_u64(lane1[i]);
-        vcombine_u64(lo, hi)
-    })
-}
-
-/// Convert NEON vector array back to separate lane arrays.
-#[inline]
-pub fn neon_to_lanes<const WIDTH: usize>(
-    state_v: &[uint64x2_t; WIDTH],
-    lane0: &mut [u64; WIDTH],
-    lane1: &mut [u64; WIDTH],
-) {
-    for i in 0..WIDTH {
-        unsafe {
-            lane0[i] = vgetq_lane_u64::<0>(state_v[i]);
-            lane1[i] = vgetq_lane_u64::<1>(state_v[i]);
-        }
-    }
-}
-
-// NEON-based internal permutation: both packed lanes processed
-// simultaneously via uint64x2_t for sum tree, diagonal, and writeback.
-
-#[inline]
-pub fn internal_permute_neon_w12(state: &mut [uint64x2_t; 12], constants: &[u64]) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            let rc_vec = vdupq_n_u64(rc);
-            s0 = add_neon(s0, rc_vec);
-
-            let s0_0 = vgetq_lane_u64::<0>(s0);
-            let s0_1 = vgetq_lane_u64::<1>(s0);
-            let s0_2_0 = mul_asm(s0_0, s0_0);
-            let s0_2_1 = mul_asm(s0_1, s0_1);
-
-            let sum1 = add_neon(state[1], state[2]);
-            let sum2 = add_neon(state[3], state[4]);
-            let sum3 = add_neon(state[5], state[6]);
-            let sum4 = add_neon(state[7], state[8]);
-            let sum5 = add_neon(state[9], state[10]);
-
-            let s0_3_0 = mul_asm(s0_2_0, s0_0);
-            let s0_3_1 = mul_asm(s0_2_1, s0_1);
-            let s0_4_0 = mul_asm(s0_2_0, s0_2_0);
-            let s0_4_1 = mul_asm(s0_2_1, s0_2_1);
-
-            let sum12 = add_neon(sum1, sum2);
-            let sum34 = add_neon(sum3, sum4);
-            let sum511 = add_neon(sum5, state[11]);
-
-            let d1 = state[1];
-            let d2 = double_neon(state[2]);
-            let d3 = div2_neon(state[3]);
-            let d4 = add_neon(double_neon(state[4]), state[4]);
-
-            let sum1234 = add_neon(sum12, sum34);
-
-            let d5 = double_neon(double_neon(state[5]));
-            let d6 = div2_neon(state[6]);
-            let d7 = add_neon(double_neon(state[7]), state[7]);
-            let d8 = double_neon(double_neon(state[8]));
-
-            let sum_hi = add_neon(sum1234, sum511);
-
-            let d9 = div4_neon(state[9]);
-            let d10 = div4_neon(state[10]);
-            let d11 = div8_neon(state[11]);
-
-            let s0_7_0 = mul_asm(s0_3_0, s0_4_0);
-            let s0_7_1 = mul_asm(s0_3_1, s0_4_1);
-            let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1));
-
-            let sum = add_neon(sum_hi, s0_7);
-            s0 = sub_neon(sum_hi, s0_7);
-
-            state[1] = add_neon(d1, sum);
-            state[2] = add_neon(d2, sum);
-            state[3] = add_neon(d3, sum);
-            state[4] = add_neon(d4, sum);
-            state[5] = add_neon(d5, sum);
-            state[6] = sub_neon(sum, d6);
-            state[7] = sub_neon(sum, d7);
-            state[8] = sub_neon(sum, d8);
-            state[9] = add_neon(d9, sum);
-            state[10] = sub_neon(sum, d10);
-            state[11] = add_neon(d11, sum);
-        }
-    }
-    state[0] = s0;
-}
-
-#[inline]
-pub fn internal_permute_neon_w16(state: &mut [uint64x2_t; 16], constants: &[u64]) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            let rc_vec = vdupq_n_u64(rc);
-            s0 = add_neon(s0, rc_vec);
-
-            let s0_0 = vgetq_lane_u64::<0>(s0);
-            let s0_1 = vgetq_lane_u64::<1>(s0);
-            let s0_2_0 = mul_asm(s0_0, s0_0);
-            let s0_2_1 = mul_asm(s0_1, s0_1);
-
-            let sum1 = add_neon(state[1], state[2]);
-            let sum2 = add_neon(state[3], state[4]);
-            let sum3 = add_neon(state[5], state[6]);
-            let sum4 = add_neon(state[7], state[8]);
-            let sum5 = add_neon(state[9], state[10]);
-            let sum6 = add_neon(state[11], state[12]);
-            let sum7 = add_neon(state[13], state[14]);
-
-            let s0_3_0 = mul_asm(s0_2_0, s0_0);
-            let s0_3_1 = mul_asm(s0_2_1, s0_1);
-            let s0_4_0 = mul_asm(s0_2_0, s0_2_0);
-            let s0_4_1 = mul_asm(s0_2_1, s0_2_1);
-
-            let sum12 = add_neon(sum1, sum2);
-            let sum34 = add_neon(sum3, sum4);
-            let sum56 = add_neon(sum5, sum6);
-            let sum715 = add_neon(sum7, state[15]);
-
-            let sum1234 = add_neon(sum12, sum34);
-            let sum56715 = add_neon(sum56, sum715);
-            let sum_hi = add_neon(sum1234, sum56715);
-
-            let d1 = state[1];
-            let d2 = double_neon(state[2]);
-            let d3 = div2_neon(state[3]);
-            let d4 = add_neon(double_neon(state[4]), state[4]);
-            let d5 = double_neon(double_neon(state[5]));
-            let d6 = div2_neon(state[6]);
-            let d7 = add_neon(double_neon(state[7]), state[7]);
-            let d8 = double_neon(double_neon(state[8]));
-
-            let d9 = div8_neon(state[9]);
-            let d10 = div16_neon(state[10]);
-            let d11 = div32_neon(state[11]);
-            let d12 = div8_neon(state[12]);
-            let d13 = div16_neon(state[13]);
-            let d14 = div32_neon(state[14]);
-            let d15 = div_2_32_neon(state[15]);
-
-            let s0_7_0 = mul_asm(s0_3_0, s0_4_0);
-            let s0_7_1 = mul_asm(s0_3_1, s0_4_1);
-            let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1));
-
-            let sum = add_neon(sum_hi, s0_7);
-            s0 = sub_neon(sum_hi, s0_7);
-
-            state[1] = add_neon(d1, sum);
-            state[2] = add_neon(d2, sum);
-            state[3] = add_neon(d3, sum);
-            state[4] = add_neon(d4, sum);
-            state[5] = add_neon(d5, sum);
-            state[6] = sub_neon(sum, d6);
-            state[7] = sub_neon(sum, d7);
-            state[8] = sub_neon(sum, d8);
-            state[9] = add_neon(d9, sum);
-            state[10] = add_neon(d10, sum);
-            state[11] = add_neon(d11, sum);
-            state[12] = sub_neon(sum, d12);
-            state[13] = sub_neon(sum, d13);
-            state[14] = sub_neon(sum, d14);
-            state[15] = add_neon(d15, sum);
-        }
-    }
-    state[0] = s0;
-}
-
-#[inline]
-pub fn internal_permute_neon<const WIDTH: usize>(
-    state: &mut [uint64x2_t; WIDTH],
-    diag: &[u64; WIDTH],
-    constants: &[u64],
-) {
-    let mut s0 = state[0];
-    for &rc in constants {
-        unsafe {
-            let rc_vec = vdupq_n_u64(rc);
-            s0 = add_neon(s0, rc_vec);
-
-            let s0_0 = vgetq_lane_u64::<0>(s0);
-            let s0_1 = vgetq_lane_u64::<1>(s0);
-            let s0_2_0 = mul_asm(s0_0, s0_0);
-            let s0_2_1 = mul_asm(s0_1, s0_1);
-            let s0_3_0 = mul_asm(s0_2_0, s0_0);
-            let s0_3_1 = mul_asm(s0_2_1, s0_1);
-            let s0_4_0 = mul_asm(s0_2_0, s0_2_0);
-            let s0_4_1 = mul_asm(s0_2_1, s0_2_1);
-            let s0_7_0 = mul_asm(s0_3_0, s0_4_0);
-            let s0_7_1 = mul_asm(s0_3_1, s0_4_1);
-            let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1));
-
-            let zero = vdupq_n_u64(0);
-            let mut sum_hi = zero;
-            for &s in state.iter().skip(1) {
-                sum_hi = add_neon(sum_hi, s);
-            }
-
-            let sum = add_neon(sum_hi, s0_7);
-            s0 = vcombine_u64(
-                vcreate_u64(mul_add_asm(s0_7_0, diag[0], vgetq_lane_u64::<0>(sum))),
-                vcreate_u64(mul_add_asm(s0_7_1, diag[0], vgetq_lane_u64::<1>(sum))),
-            );
-
-            for i in 1..WIDTH {
-                let s_0 = mul_add_asm(
-                    vgetq_lane_u64::<0>(state[i]),
-                    diag[i],
-                    vgetq_lane_u64::<0>(sum),
-                );
-                let s_1 = mul_add_asm(
-                    vgetq_lane_u64::<1>(state[i]),
-                    diag[i],
-                    vgetq_lane_u64::<1>(sum),
-                );
-                state[i] = vcombine_u64(vcreate_u64(s_0), vcreate_u64(s_1));
-            }
-        }
-    }
-    state[0] = s0;
-}
-
-// NEON-based external round: S-box stays scalar, MDS uses NEON.
-
-#[inline(always)]
-unsafe fn sbox_neon<const WIDTH: usize>(state: &mut [uint64x2_t; WIDTH]) {
-    unsafe {
-        let mut x2_0 = [0u64; WIDTH];
-        let mut x2_1 = [0u64; WIDTH];
-        for i in 0..WIDTH {
-            let a = vgetq_lane_u64::<0>(state[i]);
-            let b = vgetq_lane_u64::<1>(state[i]);
-            x2_0[i] = mul_asm(a, a);
-            x2_1[i] = mul_asm(b, b);
-        }
-        let mut x3_0 = [0u64; WIDTH];
-        let mut x3_1 = [0u64; WIDTH];
-        let mut x4_0 = [0u64; WIDTH];
-        let mut x4_1 = [0u64; WIDTH];
-        for i in 0..WIDTH {
-            let a = vgetq_lane_u64::<0>(state[i]);
-            let b = vgetq_lane_u64::<1>(state[i]);
-            x3_0[i] = mul_asm(x2_0[i], a);
-            x3_1[i] = mul_asm(x2_1[i], b);
-            x4_0[i] = mul_asm(x2_0[i], x2_0[i]);
-            x4_1[i] = mul_asm(x2_1[i], x2_1[i]);
-        }
-        for i in 0..WIDTH {
-            let r0 = mul_asm(x3_0[i], x4_0[i]);
-            let r1 = mul_asm(x3_1[i], x4_1[i]);
-            state[i] = vcombine_u64(vcreate_u64(r0), vcreate_u64(r1));
-        }
-    }
-}
-
-#[inline(always)]
-unsafe fn external_round_neon<const WIDTH: usize>(
-    state: &mut [uint64x2_t; WIDTH],
-    rc: &[u64; WIDTH],
-) {
-    unsafe {
-        for i in 0..WIDTH {
-            let rc_vec = vdupq_n_u64(rc[i]);
-            state[i] = add_neon(state[i], rc_vec);
-        }
-        sbox_neon(state);
-        mds_light_neon(state);
-    }
-}
-
-/// NEON initial external permute.
-#[inline]
-pub fn external_initial_neon<const WIDTH: usize>(
-    state: &mut [uint64x2_t; WIDTH],
-    constants: &[[u64; WIDTH]],
-) {
-    unsafe {
-        mds_light_neon(state);
-    }
-    for rc in constants {
-        unsafe {
-            external_round_neon(state, rc);
-        }
-    }
-}
-
-/// NEON terminal external permute.
-#[inline]
-pub fn external_terminal_neon<const WIDTH: usize>(
-    state: &mut [uint64x2_t; WIDTH],
-    constants: &[[u64; WIDTH]],
-) {
-    for rc in constants {
-        unsafe {
-            external_round_neon(state, rc);
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use alloc::vec::Vec;
-
-    use p3_field::{PrimeCharacteristicRing, PrimeField64};
-    use p3_poseidon2::{MDSMat4, matmul_internal, mds_light_permutation};
-    use proptest::prelude::*;
-    use rand::rngs::SmallRng;
-    use rand::{RngExt, SeedableRng};
-
-    use super::*;
-    use crate::{
-        Goldilocks, MATRIX_DIAG_8_GOLDILOCKS, MATRIX_DIAG_12_GOLDILOCKS, MATRIX_DIAG_16_GOLDILOCKS,
-        MATRIX_DIAG_20_GOLDILOCKS,
-    };
-
-    type F = Goldilocks;
-
-    /// Reduce a raw u64 to its canonical Goldilocks representative.
-    fn canon(x: u64) -> u64 {
-        F::new(x).as_canonical_u64()
-    }
-
-    /// Pack two u64 lanes into a single NEON vector.
-    unsafe fn make_neon(a: u64, b: u64) -> uint64x2_t {
-        unsafe { vcombine_u64(vcreate_u64(a), vcreate_u64(b)) }
-    }
-
-    /// Extract both u64 lanes from a NEON vector.
-    unsafe fn read_neon(v: uint64x2_t) -> (u64, u64) {
-        unsafe { (vgetq_lane_u64::<0>(v), vgetq_lane_u64::<1>(v)) }
-    }
-
-    proptest! {
-        #[test]
-        fn test_sub_asm(a: u64, b: u64) {
-            // Compute a - b using the standard field implementation.
-            let expected = (F::new(a) - F::new(b)).as_canonical_u64();
-
-            // The ASM version should give the same canonical result.
-            let got = canon(unsafe { sub_asm(a, b) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_double_asm(a: u64) {
-            // Doubling is just a + a in the field.
-            let expected = (F::new(a) + F::new(a)).as_canonical_u64();
-
-            // The ASM shortcut should match.
-            let got = canon(unsafe { double_asm(a) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_div2_asm(x: u64) {
-            // Dividing by 2 is one halving in the field.
-            let expected = F::new(x).halve().as_canonical_u64();
-
-            let got = canon(unsafe { div2_asm(x) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_div4_asm(x: u64) {
-            // Dividing by 4 is two halvings.
-            let expected = F::new(x).halve().halve().as_canonical_u64();
-
-            let got = canon(unsafe { div4_asm(x) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_div8_asm(x: u64) {
-            // Dividing by 8 is three halvings.
-            let expected = F::new(x).halve().halve().halve().as_canonical_u64();
-
-            let got = canon(unsafe { div8_asm(x) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_div16_asm(x: u64) {
-            // Dividing by 16 is four halvings.
-            let expected = F::new(x).halve().halve().halve().halve().as_canonical_u64();
-
-            let got = canon(unsafe { div16_asm(x) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_div32_asm(x: u64) {
-            // Dividing by 32 is five halvings.
-            let expected = F::new(x)
-                .halve().halve().halve().halve().halve()
-                .as_canonical_u64();
-
-            let got = canon(unsafe { div32_asm(x) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_div_2_32_asm(x: u64) {
-            // Dividing by 2^32: apply halve 32 times as reference.
-            let mut v = F::new(x);
-            for _ in 0..32 {
-                v = v.halve();
-            }
-            let expected = v.as_canonical_u64();
-
-            let got = canon(unsafe { div_2_32_asm(x) });
-            prop_assert_eq!(got, expected);
-        }
-
-        #[test]
-        fn test_apply_mat4_asm(x0: u64, x1: u64, x2: u64, x3: u64) {
-            // Build field elements from the raw inputs.
-            let f = [F::new(x0), F::new(x1), F::new(x2), F::new(x3)];
-
-            // The [2,3,1,1] circulant matrix rows.
-            let two = F::TWO;
-            let three = two + F::ONE;
-            let e0 = two * f[0] + three * f[1] + f[2] + f[3];
-            let e1 = f[0] + two * f[1] + three * f[2] + f[3];
-            let e2 = f[0] + f[1] + two * f[2] + three * f[3];
-            let e3 = three * f[0] + f[1] + f[2] + two * f[3];
-
-            // Run the ASM version on raw u64s.
-            let mut state = [x0, x1, x2, x3];
-            unsafe { apply_mat4_asm(&mut state); }
-
-            // Each slot must match the field-level reference.
-            prop_assert_eq!(canon(state[0]), e0.as_canonical_u64());
-            prop_assert_eq!(canon(state[1]), e1.as_canonical_u64());
-            prop_assert_eq!(canon(state[2]), e2.as_canonical_u64());
-            prop_assert_eq!(canon(state[3]), e3.as_canonical_u64());
-        }
-
-        #[test]
-        fn test_mds_light_permutation_asm_w8(vals in prop::array::uniform8(any::<u64>())) {
-            // Build field-level state and apply the generic MDS.
-            let mut state_generic: [F; 8] = vals.map(F::new);
-            mds_light_permutation(&mut state_generic, &MDSMat4);
-
-            // Run the ASM version on the same raw values.
-            let mut state_asm = vals;
-            unsafe { mds_light_permutation_asm(&mut state_asm); }
-
-            // Every element must agree.
-            for i in 0..8 {
-                prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64());
-            }
-        }
-
-        #[test]
-        fn test_mds_light_permutation_asm_w12(vals in prop::array::uniform12(any::<u64>())) {
-            let mut state_generic: [F; 12] = vals.map(F::new);
-            mds_light_permutation(&mut state_generic, &MDSMat4);
-
-            let mut state_asm = vals;
-            unsafe { mds_light_permutation_asm(&mut state_asm); }
-
-            for i in 0..12 {
-                prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64());
-            }
-        }
-
-        #[test]
-        fn test_mds_light_permutation_asm_w16(vals in prop::array::uniform16(any::<u64>())) {
-            let mut state_generic: [F; 16] = vals.map(F::new);
-            mds_light_permutation(&mut state_generic, &MDSMat4);
-
-            let mut state_asm = vals;
-            unsafe { mds_light_permutation_asm(&mut state_asm); }
-
-            for i in 0..16 {
-                prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64());
-            }
-        }
-
-        #[test]
-        fn test_sbox_layer_asm(vals in prop::array::uniform8(any::<u64>())) {
-            // Apply the ASM S-box to a copy of the input.
-            let mut state = vals;
-            unsafe { sbox_layer_asm(&mut state); }
-
-            // Verify each element is x^7 = x^3 * x^4.
-            for i in 0..8 {
-                let x = F::new(vals[i]);
-                let x2 = x * x;
-                let x3 = x2 * x;
-                let x4 = x2 * x2;
-                let x7 = x3 * x4;
-                prop_assert_eq!(canon(state[i]), x7.as_canonical_u64());
-            }
-        }
-
-        #[test]
-        fn test_external_round_asm(
-            vals in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Build reference: add round constants, apply x^7, then MDS.
-            let mut expected: [F; 8] = core::array::from_fn(|i| F::new(vals[i]) + F::new(rc[i]));
-            for x in expected.iter_mut() {
-                let x2 = *x * *x;
-                let x3 = x2 * *x;
-                let x4 = x2 * x2;
-                *x = x3 * x4;
-            }
-            mds_light_permutation(&mut expected, &MDSMat4);
-
-            // Run the ASM external round.
-            let mut state = vals;
-            unsafe { external_round_asm(&mut state, &rc); }
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64());
-            }
-        }
-
-        #[test]
-        fn test_sbox_layer_dual_asm(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Run sbox on each lane independently as reference.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                sbox_layer_asm(&mut ref0);
-                sbox_layer_asm(&mut ref1);
-            }
-
-            // The dual-lane version processes both at once.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { sbox_layer_dual_asm(&mut s0, &mut s1); }
-
-            // Both lanes must match their single-lane reference.
-            for i in 0..8 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_round_dual_asm(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Run external round on each lane independently as reference.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                external_round_asm(&mut ref0, &rc);
-                external_round_asm(&mut ref1, &rc);
-            }
-
-            // The dual-lane version processes both at once.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { external_round_dual_asm(&mut s0, &mut s1, &rc); }
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_round_fused_w8(
-            vals in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            // The generic external round is the reference.
-            let mut ref_state = vals;
-            unsafe { external_round_asm(&mut ref_state, &rc); }
-
-            // The fused W8 version should produce the same output.
-            let mut fused_state = vals;
-            unsafe { external_round_fused_w8(&mut fused_state, &rc); }
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(fused_state[i]), canon(ref_state[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_round_fused_dual_w8(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Run the fused round on each lane independently as reference.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            unsafe {
-                external_round_fused_w8(&mut ref0, &rc);
-                external_round_fused_w8(&mut ref1, &rc);
-            }
-
-            // The dual version processes both at once.
-            let mut s0 = vals0;
-            let mut s1 = vals1;
-            unsafe { external_round_fused_dual_w8(&mut s0, &mut s1, &rc); }
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(s0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(s1[i]), canon(ref1[i]));
-            }
-        }
-    }
-
-    fn test_internal_round_matches<const WIDTH: usize>(diag: [F; WIDTH]) {
-        let mut rng = SmallRng::seed_from_u64(12345);
-
-        // Build random state and constants.
-        let mut state_asm: [F; WIDTH] = rng.random();
-        let mut state_generic = state_asm;
-
-        let internal_constants: [F; 22] = rng.random();
-        let constants_raw: Vec<u64> = internal_constants.iter().map(|c| c.value).collect();
-        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
-
-        // Run the ASM internal permute on raw u64 representation.
-        let state_raw: &mut [u64; WIDTH] =
-            unsafe { &mut *(&mut state_asm as *mut [F; WIDTH] as *mut [u64; WIDTH]) };
-        internal_permute_state_asm(state_raw, &diag_raw, &constants_raw);
-
-        // Build the same result via field-level ops: add RC, S-box on s0, matmul.
-        for &rc in internal_constants.iter() {
-            state_generic[0] += rc;
-            let s = state_generic[0];
-            let s2 = s * s;
-            let s3 = s2 * s;
-            let s4 = s2 * s2;
-            state_generic[0] = s3 * s4;
-            matmul_internal(&mut state_generic, diag);
-        }
-
-        for i in 0..WIDTH {
-            assert_eq!(
-                state_asm[i].as_canonical_u64(),
-                state_generic[i].as_canonical_u64(),
-                "mismatch at index {i}"
-            );
-        }
-    }
-
-    #[test]
-    fn test_internal_round_width_8() {
-        test_internal_round_matches(MATRIX_DIAG_8_GOLDILOCKS);
-    }
-
-    #[test]
-    fn test_internal_round_width_12() {
-        test_internal_round_matches(MATRIX_DIAG_12_GOLDILOCKS);
-    }
-
-    #[test]
-    fn test_internal_round_width_16() {
-        test_internal_round_matches(MATRIX_DIAG_16_GOLDILOCKS);
-    }
-
-    #[test]
-    fn test_internal_round_width_20() {
-        test_internal_round_matches(MATRIX_DIAG_20_GOLDILOCKS);
-    }
-
-    fn test_specialized_matches_generic<const WIDTH: usize>(
-        diag: [F; WIDTH],
-        specialized_fn: fn(&mut [u64; WIDTH], &[u64]),
-    ) {
-        let mut rng = SmallRng::seed_from_u64(42);
-
-        let internal_constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
-        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
-
-        // Run both the specialized and generic versions on several random states.
-        for _ in 0..8 {
-            let mut state_specialized: [u64; WIDTH] = rng.random();
-            let mut state_generic = state_specialized;
-
-            specialized_fn(&mut state_specialized, &internal_constants);
-            internal_permute_state_asm(&mut state_generic, &diag_raw, &internal_constants);
-
-            for i in 0..WIDTH {
-                assert_eq!(canon(state_specialized[i]), canon(state_generic[i]));
-            }
-        }
-    }
-
-    #[test]
-    fn test_specialized_w8_matches_generic() {
-        test_specialized_matches_generic(MATRIX_DIAG_8_GOLDILOCKS, internal_permute_state_asm_w8);
-    }
-
-    #[test]
-    fn test_specialized_w12_matches_generic() {
-        test_specialized_matches_generic(MATRIX_DIAG_12_GOLDILOCKS, internal_permute_state_asm_w12);
-    }
-
-    #[test]
-    fn test_specialized_w16_matches_generic() {
-        test_specialized_matches_generic(MATRIX_DIAG_16_GOLDILOCKS, internal_permute_state_asm_w16);
-    }
-
-    #[allow(clippy::type_complexity)]
-    fn test_dual_matches_single<const WIDTH: usize>(
-        diag: [F; WIDTH],
-        single_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]),
-        dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64; WIDTH], &[u64]),
-    ) {
-        let mut rng = SmallRng::seed_from_u64(77);
-
-        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
-        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
-
-        // Run single-lane on each lane independently.
-        let mut lane0: [u64; WIDTH] = rng.random();
-        let mut lane1: [u64; WIDTH] = rng.random();
-        let mut ref0 = lane0;
-        let mut ref1 = lane1;
-
-        single_fn(&mut ref0, &diag_raw, &constants);
-        single_fn(&mut ref1, &diag_raw, &constants);
-
-        // Run dual-lane on both at once. Must match.
-        dual_fn(&mut lane0, &mut lane1, &diag_raw, &constants);
-
-        for i in 0..WIDTH {
-            assert_eq!(canon(lane0[i]), canon(ref0[i]), "lane0 mismatch at {i}");
-            assert_eq!(canon(lane1[i]), canon(ref1[i]), "lane1 mismatch at {i}");
-        }
-    }
-
-    #[test]
-    fn test_internal_permute_split_dual_w8() {
-        test_dual_matches_single(
-            MATRIX_DIAG_8_GOLDILOCKS,
-            internal_permute_state_asm,
-            internal_permute_split_dual,
-        );
-    }
-
-    #[test]
-    fn test_internal_permute_split_dual_w12() {
-        test_dual_matches_single(
-            MATRIX_DIAG_12_GOLDILOCKS,
-            internal_permute_state_asm,
-            internal_permute_split_dual,
-        );
-    }
-
-    #[test]
-    fn test_internal_permute_split_dual_w16() {
-        test_dual_matches_single(
-            MATRIX_DIAG_16_GOLDILOCKS,
-            internal_permute_state_asm,
-            internal_permute_split_dual,
-        );
-    }
-
-    fn test_specialized_dual_matches_generic_dual<const WIDTH: usize>(
-        diag: [F; WIDTH],
-        specialized_dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64]),
-    ) {
-        let mut rng = SmallRng::seed_from_u64(99);
-
-        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
-        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
-
-        // The generic dual-lane version is the reference.
-        let mut lane0: [u64; WIDTH] = rng.random();
-        let mut lane1: [u64; WIDTH] = rng.random();
-        let mut ref0 = lane0;
-        let mut ref1 = lane1;
-
-        internal_permute_split_dual(&mut ref0, &mut ref1, &diag_raw, &constants);
-
-        // The specialized version must match.
-        specialized_dual_fn(&mut lane0, &mut lane1, &constants);
-
-        for i in 0..WIDTH {
-            assert_eq!(canon(lane0[i]), canon(ref0[i]), "lane0 mismatch at {i}");
-            assert_eq!(canon(lane1[i]), canon(ref1[i]), "lane1 mismatch at {i}");
-        }
-    }
-
-    #[test]
-    fn test_specialized_dual_w8_matches_generic() {
-        test_specialized_dual_matches_generic_dual(
-            MATRIX_DIAG_8_GOLDILOCKS,
-            internal_permute_split_dual_w8,
-        );
-    }
-
-    #[test]
-    fn test_specialized_dual_w12_matches_generic() {
-        test_specialized_dual_matches_generic_dual(
-            MATRIX_DIAG_12_GOLDILOCKS,
-            internal_permute_split_dual_w12,
-        );
-    }
-
-    #[test]
-    fn test_specialized_dual_w16_matches_generic() {
-        test_specialized_dual_matches_generic_dual(
-            MATRIX_DIAG_16_GOLDILOCKS,
-            internal_permute_split_dual_w16,
-        );
-    }
-
-    fn make_round_constants<const WIDTH: usize>(seed: u64, num_rounds: usize) -> Vec<[u64; WIDTH]> {
-        let mut rng = SmallRng::seed_from_u64(seed);
-        (0..num_rounds).map(|_| rng.random()).collect()
-    }
-
-    proptest! {
-        #[test]
-        fn test_external_initial_permute_state_asm(
-            vals in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(42, 4);
-
-            // Reference: apply MDS once, then each external round manually.
-            let mut expected = vals;
-            unsafe { mds_light_permutation_asm(&mut expected); }
-            for rc in &constants {
-                unsafe { external_round_asm(&mut expected, rc); }
-            }
-
-            // The composed function should give the same result.
-            let mut got = vals;
-            external_initial_permute_state_asm(&mut got, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(got[i]), canon(expected[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_terminal_permute_state_asm(
-            vals in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(43, 4);
-
-            // Reference: just the external rounds, no initial MDS.
-            let mut expected = vals;
-            for rc in &constants {
-                unsafe { external_round_asm(&mut expected, rc); }
-            }
-
-            let mut got = vals;
-            external_terminal_permute_state_asm(&mut got, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(got[i]), canon(expected[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_initial_permute_w8(
-            vals in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(44, 4);
-
-            // The generic version is the reference.
-            let mut expected = vals;
-            external_initial_permute_state_asm(&mut expected, &constants);
-
-            // The W8-specialized version must match.
-            let mut got = vals;
-            external_initial_permute_w8(&mut got, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(got[i]), canon(expected[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_terminal_permute_w8(
-            vals in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(45, 4);
-
-            let mut expected = vals;
-            external_terminal_permute_state_asm(&mut expected, &constants);
-
-            let mut got = vals;
-            external_terminal_permute_w8(&mut got, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(got[i]), canon(expected[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_initial_permute_dual(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(46, 4);
-
-            // Run single-lane on each independently as reference.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            external_initial_permute_state_asm(&mut ref0, &constants);
-            external_initial_permute_state_asm(&mut ref1, &constants);
-
-            // The dual version processes both at once.
-            let mut l0 = vals0;
-            let mut l1 = vals1;
-            external_initial_permute_dual(&mut l0, &mut l1, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(l0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(l1[i]), canon(ref1[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_terminal_permute_dual(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(47, 4);
-
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            external_terminal_permute_state_asm(&mut ref0, &constants);
-            external_terminal_permute_state_asm(&mut ref1, &constants);
-
-            let mut l0 = vals0;
-            let mut l1 = vals1;
-            external_terminal_permute_dual(&mut l0, &mut l1, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(l0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(l1[i]), canon(ref1[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_initial_permute_dual_w8(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(48, 4);
-
-            // The generic dual version is the reference.
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            external_initial_permute_dual(&mut ref0, &mut ref1, &constants);
-
-            // The W8-specialized dual must match.
-            let mut l0 = vals0;
-            let mut l1 = vals1;
-            external_initial_permute_dual_w8(&mut l0, &mut l1, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(l0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(l1[i]), canon(ref1[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_terminal_permute_dual_w8(
-            vals0 in prop::array::uniform8(any::<u64>()),
-            vals1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(49, 4);
-
-            let mut ref0 = vals0;
-            let mut ref1 = vals1;
-            external_terminal_permute_dual(&mut ref0, &mut ref1, &constants);
-
-            let mut l0 = vals0;
-            let mut l1 = vals1;
-            external_terminal_permute_dual_w8(&mut l0, &mut l1, &constants);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(l0[i]), canon(ref0[i]));
-                prop_assert_eq!(canon(l1[i]), canon(ref1[i]));
-            }
-        }
-
-        #[test]
-        fn test_add_neon(a0: u64, a1: u64, b0: u64, b1: u64) {
-            unsafe {
-                // Pack two lanes into NEON vectors, add, then read back.
-                let (r0, r1) = read_neon(add_neon(make_neon(a0, a1), make_neon(b0, b1)));
-
-                // Each lane must match its scalar add_asm equivalent.
-                prop_assert_eq!(canon(r0), canon(add_asm(a0, b0)));
-                prop_assert_eq!(canon(r1), canon(add_asm(a1, b1)));
-            }
-        }
-
-        #[test]
-        fn test_sub_neon(a0: u64, a1: u64, b0: u64, b1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(sub_neon(make_neon(a0, a1), make_neon(b0, b1)));
-
-                prop_assert_eq!(canon(r0), canon(sub_asm(a0, b0)));
-                prop_assert_eq!(canon(r1), canon(sub_asm(a1, b1)));
-            }
-        }
-
-        #[test]
-        fn test_double_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(double_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(double_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(double_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_div2_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(div2_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(div2_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(div2_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_div4_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(div4_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(div4_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(div4_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_div8_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(div8_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(div8_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(div8_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_div16_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(div16_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(div16_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(div16_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_div32_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(div32_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(div32_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(div32_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_div_2_32_neon(a0: u64, a1: u64) {
-            unsafe {
-                let (r0, r1) = read_neon(div_2_32_neon(make_neon(a0, a1)));
-
-                prop_assert_eq!(canon(r0), canon(div_2_32_asm(a0)));
-                prop_assert_eq!(canon(r1), canon(div_2_32_asm(a1)));
-            }
-        }
-
-        #[test]
-        fn test_apply_mat4_neon(
-            a0: u64, a1: u64, a2: u64, a3: u64,
-            b0: u64, b1: u64, b2: u64, b3: u64,
-        ) {
-            unsafe {
-                // Scalar reference: run apply_mat4_asm on each lane separately.
-                let mut lane_a = [a0, a1, a2, a3];
-                let mut lane_b = [b0, b1, b2, b3];
-                apply_mat4_asm(&mut lane_a);
-                apply_mat4_asm(&mut lane_b);
-
-                // NEON version: pack both lanes into vectors, apply, read back.
-                let mut neon_state = [
-                    make_neon(a0, b0),
-                    make_neon(a1, b1),
-                    make_neon(a2, b2),
-                    make_neon(a3, b3),
-                ];
-                apply_mat4_neon(&mut neon_state);
-
-                for i in 0..4 {
-                    let (r0, r1) = read_neon(neon_state[i]);
-                    prop_assert_eq!(canon(r0), canon(lane_a[i]));
-                    prop_assert_eq!(canon(r1), canon(lane_b[i]));
-                }
-            }
-        }
-
-        #[test]
-        fn test_mds_light_neon_w8(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            unsafe {
-                // Run scalar MDS on each lane independently.
-                let mut ref_a = lane_a;
-                let mut ref_b = lane_b;
-                mds_light_permutation_asm(&mut ref_a);
-                mds_light_permutation_asm(&mut ref_b);
-
-                // Pack both lanes into NEON vectors and run the NEON MDS.
-                let mut neon_state: [uint64x2_t; 8] =
-                    core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i]));
-                mds_light_neon(&mut neon_state);
-
-                // Each lane of each vector must match the scalar reference.
-                for i in 0..8 {
-                    let (r0, r1) = read_neon(neon_state[i]);
-                    prop_assert_eq!(canon(r0), canon(ref_a[i]));
-                    prop_assert_eq!(canon(r1), canon(ref_b[i]));
-                }
-            }
-        }
-
-        #[test]
-        fn test_sbox_neon(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            unsafe {
-                // Scalar reference on each lane.
-                let mut ref_a = lane_a;
-                let mut ref_b = lane_b;
-                sbox_layer_asm(&mut ref_a);
-                sbox_layer_asm(&mut ref_b);
-
-                // NEON version on packed lanes.
-                let mut neon_state: [uint64x2_t; 8] =
-                    core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i]));
-                sbox_neon(&mut neon_state);
-
-                for i in 0..8 {
-                    let (r0, r1) = read_neon(neon_state[i]);
-                    prop_assert_eq!(canon(r0), canon(ref_a[i]));
-                    prop_assert_eq!(canon(r1), canon(ref_b[i]));
-                }
-            }
-        }
-
-        #[test]
-        fn test_external_round_neon(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-            rc in prop::array::uniform8(any::<u64>()),
-        ) {
-            unsafe {
-                // Scalar reference on each lane.
-                let mut ref_a = lane_a;
-                let mut ref_b = lane_b;
-                external_round_asm(&mut ref_a, &rc);
-                external_round_asm(&mut ref_b, &rc);
-
-                // NEON version on packed lanes.
-                let mut neon_state: [uint64x2_t; 8] =
-                    core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i]));
-                external_round_neon(&mut neon_state, &rc);
-
-                for i in 0..8 {
-                    let (r0, r1) = read_neon(neon_state[i]);
-                    prop_assert_eq!(canon(r0), canon(ref_a[i]));
-                    prop_assert_eq!(canon(r1), canon(ref_b[i]));
-                }
-            }
-        }
-
-        #[test]
-        fn test_lanes_roundtrip(
-            lane0 in prop::array::uniform8(any::<u64>()),
-            lane1 in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Pack two lane arrays into NEON vectors.
-            let packed = lanes_to_neon(&lane0, &lane1);
-
-            // Unpack back into separate arrays.
-            let mut out0 = [0u64; 8];
-            let mut out1 = [0u64; 8];
-            neon_to_lanes(&packed, &mut out0, &mut out1);
-
-            // Must recover the original values.
-            prop_assert_eq!(out0, lane0);
-            prop_assert_eq!(out1, lane1);
-        }
-
-        #[test]
-        fn test_external_initial_neon(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(50, 4);
-
-            // Scalar reference on each lane.
-            let mut ref_a = lane_a;
-            let mut ref_b = lane_b;
-            external_initial_permute_state_asm(&mut ref_a, &constants);
-            external_initial_permute_state_asm(&mut ref_b, &constants);
-
-            // NEON version on packed lanes.
-            let mut neon_state = lanes_to_neon(&lane_a, &lane_b);
-            external_initial_neon(&mut neon_state, &constants);
-
-            let mut out_a = [0u64; 8];
-            let mut out_b = [0u64; 8];
-            neon_to_lanes(&neon_state, &mut out_a, &mut out_b);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(out_a[i]), canon(ref_a[i]));
-                prop_assert_eq!(canon(out_b[i]), canon(ref_b[i]));
-            }
-        }
-
-        #[test]
-        fn test_external_terminal_neon(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            let constants = make_round_constants::<8>(51, 4);
-
-            let mut ref_a = lane_a;
-            let mut ref_b = lane_b;
-            external_terminal_permute_state_asm(&mut ref_a, &constants);
-            external_terminal_permute_state_asm(&mut ref_b, &constants);
-
-            let mut neon_state = lanes_to_neon(&lane_a, &lane_b);
-            external_terminal_neon(&mut neon_state, &constants);
-
-            let mut out_a = [0u64; 8];
-            let mut out_b = [0u64; 8];
-            neon_to_lanes(&neon_state, &mut out_a, &mut out_b);
-
-            for i in 0..8 {
-                prop_assert_eq!(canon(out_a[i]), canon(ref_a[i]));
-                prop_assert_eq!(canon(out_b[i]), canon(ref_b[i]));
-            }
-        }
-    }
-
-    fn test_internal_neon_matches_scalar<const WIDTH: usize>(
-        diag: [F; WIDTH],
-        neon_fn: fn(&mut [uint64x2_t; WIDTH], &[u64]),
-        scalar_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]),
-    ) {
-        let mut rng = SmallRng::seed_from_u64(55);
-
-        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
-        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
-
-        let lane_a: [u64; WIDTH] = rng.random();
-        let lane_b: [u64; WIDTH] = rng.random();
-
-        // Scalar reference on each lane independently.
-        let mut ref_a = lane_a;
-        let mut ref_b = lane_b;
-        scalar_fn(&mut ref_a, &diag_raw, &constants);
-        scalar_fn(&mut ref_b, &diag_raw, &constants);
-
-        // NEON version packs both lanes and processes them together.
-        let mut neon_state = lanes_to_neon(&lane_a, &lane_b);
-        neon_fn(&mut neon_state, &constants);
-
-        let mut out_a = [0u64; WIDTH];
-        let mut out_b = [0u64; WIDTH];
-        neon_to_lanes(&neon_state, &mut out_a, &mut out_b);
-
-        for i in 0..WIDTH {
-            assert_eq!(canon(out_a[i]), canon(ref_a[i]), "lane0 mismatch at {i}");
-            assert_eq!(canon(out_b[i]), canon(ref_b[i]), "lane1 mismatch at {i}");
-        }
-    }
-
-    #[test]
-    fn test_internal_permute_neon_w12() {
-        test_internal_neon_matches_scalar(
-            MATRIX_DIAG_12_GOLDILOCKS,
-            internal_permute_neon_w12,
-            internal_permute_state_asm,
-        );
-    }
-
-    #[test]
-    fn test_internal_permute_neon_w16() {
-        test_internal_neon_matches_scalar(
-            MATRIX_DIAG_16_GOLDILOCKS,
-            internal_permute_neon_w16,
-            internal_permute_state_asm,
-        );
-    }
-
-    fn test_internal_neon_generic_matches_scalar<const WIDTH: usize>(diag: [F; WIDTH]) {
-        let mut rng = SmallRng::seed_from_u64(66);
-
-        let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value);
-        let constants: Vec<u64> = (0..22).map(|_| rng.random()).collect();
-
-        let lane_a: [u64; WIDTH] = rng.random();
-        let lane_b: [u64; WIDTH] = rng.random();
-
-        // Scalar reference.
-        let mut ref_a = lane_a;
-        let mut ref_b = lane_b;
-        internal_permute_state_asm(&mut ref_a, &diag_raw, &constants);
-        internal_permute_state_asm(&mut ref_b, &diag_raw, &constants);
-
-        // Generic NEON version.
-        let mut neon_state = lanes_to_neon(&lane_a, &lane_b);
-        internal_permute_neon(&mut neon_state, &diag_raw, &constants);
-
-        let mut out_a = [0u64; WIDTH];
-        let mut out_b = [0u64; WIDTH];
-        neon_to_lanes(&neon_state, &mut out_a, &mut out_b);
-
-        for i in 0..WIDTH {
-            assert_eq!(canon(out_a[i]), canon(ref_a[i]), "lane0 mismatch at {i}");
-            assert_eq!(canon(out_b[i]), canon(ref_b[i]), "lane1 mismatch at {i}");
-        }
-    }
-
-    #[test]
-    fn test_internal_permute_neon_generic_w8() {
-        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_8_GOLDILOCKS);
-    }
-
-    #[test]
-    fn test_internal_permute_neon_generic_w12() {
-        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_12_GOLDILOCKS);
-    }
-
-    #[test]
-    fn test_internal_permute_neon_generic_w16() {
-        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_16_GOLDILOCKS);
-    }
-
-    #[test]
-    fn test_internal_permute_neon_generic_w20() {
-        test_internal_neon_generic_matches_scalar(MATRIX_DIAG_20_GOLDILOCKS);
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs
deleted file mode 100644
index 3d1951a57..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs
+++ /dev/null
@@ -1,400 +0,0 @@
-//! Shared utilities for Goldilocks NEON assembly.
-
-use core::arch::asm;
-
-use super::packing::PackedGoldilocksNeon;
-use crate::{Goldilocks, P};
-
-const EPSILON: u64 = P.wrapping_neg(); // 2^32 - 1
-
-// ---------------------------------------------------------------------------
-// Scalar field arithmetic (inline assembly)
-// ---------------------------------------------------------------------------
-
-/// Multiply two Goldilocks elements using inline assembly.
-///
-/// Computes `a * b mod P` where P = 2^64 - 2^32 + 1. The reduction
-/// uses the identity `2^64 = 2^32 - 1 (mod P)` (i.e. EPSILON) to fold
-/// the 128-bit product back into a single limb.
-#[inline(always)]
-pub(super) unsafe fn mul_asm(a: u64, b: u64) -> u64 {
-    let _lo: u64;
-    let _hi: u64;
-    let _t0: u64;
-    let _t1: u64;
-    let _t2: u64;
-    let result: u64;
-
-    unsafe {
-        asm!(
-            // Compute 128-bit product: hi:lo = a * b
-            "mul   {lo}, {a}, {b}",
-            "umulh {hi}, {a}, {b}",
-
-            // Reduce: result = lo - hi_hi + hi_lo * EPSILON
-            // where hi = hi_hi * 2^32 + hi_lo
-
-            // t0 = lo - (hi >> 32), with borrow detection
-            "lsr   {t0}, {hi}, #32",          // t0 = hi >> 32
-            "subs  {t1}, {lo}, {t0}",         // t1 = lo - t0, set flags
-            "csetm {t2:w}, cc",               // t2 = -1 if borrow, 0 otherwise
-            "sub   {t1}, {t1}, {t2}",         // Adjust for borrow (subtract EPSILON)
-
-            // t0 = (hi & EPSILON) * EPSILON
-            "and   {t0}, {hi}, {epsilon}",    // t0 = hi & EPSILON
-            "mul   {t0}, {t0}, {epsilon}",    // t0 = t0 * EPSILON
-
-            // result = t1 + t0, with overflow detection
-            "adds  {result}, {t1}, {t0}",     // result = t1 + t0, set flags
-            "csetm {t2:w}, cs",               // t2 = -1 if carry, 0 otherwise
-            "add   {result}, {result}, {t2}", // Add EPSILON on overflow
-
-            a = in(reg) a,
-            b = in(reg) b,
-            epsilon = in(reg) EPSILON,
-            lo = out(reg) _lo,
-            hi = out(reg) _hi,
-            t0 = out(reg) _t0,
-            t1 = out(reg) _t1,
-            t2 = out(reg) _t2,
-            result = out(reg) result,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    result
-}
-
-/// Compute `a * b + c` in the Goldilocks field using inline assembly.
-///
-/// Fused multiply-add: forms the 128-bit product `a * b`, adds `c` into
-/// the low limb (with carry propagation), then reduces modulo P.
-#[inline(always)]
-pub(super) unsafe fn mul_add_asm(a: u64, b: u64, c: u64) -> u64 {
-    let _lo: u64;
-    let _hi: u64;
-    let _t0: u64;
-    let _t1: u64;
-    let _t2: u64;
-    let result: u64;
-
-    unsafe {
-        asm!(
-            // Compute 128-bit product: hi:lo = a * b
-            "mul   {lo}, {a}, {b}",
-            "umulh {hi}, {a}, {b}",
-
-            // Accumulate c into the 128-bit product: hi:lo = hi:lo + c
-            "adds  {lo}, {lo}, {c}",
-            "adc   {hi}, {hi}, xzr",
-
-            // Reduce: result = lo - hi_hi + hi_lo * EPSILON
-            // where hi = hi_hi * 2^32 + hi_lo
-
-            // t0 = lo - (hi >> 32), with borrow detection
-            "lsr   {t0}, {hi}, #32",          // t0 = hi >> 32
-            "subs  {t1}, {lo}, {t0}",         // t1 = lo - t0, set flags
-            "csetm {t2:w}, cc",               // t2 = -1 if borrow, 0 otherwise
-            "sub   {t1}, {t1}, {t2}",         // Adjust for borrow (subtract EPSILON)
-
-            // t0 = (hi & EPSILON) * EPSILON
-            "and   {t0}, {hi}, {epsilon}",    // t0 = hi & EPSILON
-            "mul   {t0}, {t0}, {epsilon}",    // t0 = t0 * EPSILON
-
-            // result = t1 + t0, with overflow detection
-            "adds  {result}, {t1}, {t0}",     // result = t1 + t0, set flags
-            "csetm {t2:w}, cs",               // t2 = -1 if carry, 0 otherwise
-            "add   {result}, {result}, {t2}", // Add EPSILON on overflow
-
-            a = in(reg) a,
-            b = in(reg) b,
-            c = in(reg) c,
-            epsilon = in(reg) EPSILON,
-            lo = out(reg) _lo,
-            hi = out(reg) _hi,
-            t0 = out(reg) _t0,
-            t1 = out(reg) _t1,
-            t2 = out(reg) _t2,
-            result = out(reg) result,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    result
-}
-
-/// Add two Goldilocks elements with overflow handling using inline assembly.
-///
-/// Computes `a + b mod P`. On overflow (carry out of 64 bits), subtracts
-/// P by adding EPSILON (which equals -P mod 2^64, i.e. 2^32 - 1).
-#[inline(always)]
-pub(super) unsafe fn add_asm(a: u64, b: u64) -> u64 {
-    let result: u64;
-    let _adj: u64;
-
-    unsafe {
-        asm!(
-            "adds  {result}, {a}, {b}",
-            "csetm {adj:w}, cs",
-            "add   {result}, {result}, {adj}",
-            a = in(reg) a,
-            b = in(reg) b,
-            result = out(reg) result,
-            adj = out(reg) _adj,
-            options(pure, nomem, nostack),
-        );
-    }
-
-    result
-}
-
-// ---------------------------------------------------------------------------
-// Lane conversion (packed NEON <-> raw u64 arrays)
-// ---------------------------------------------------------------------------
-
-/// Unpack a packed NEON state into two raw `u64` lane arrays.
-///
-/// Each packed slot contains two Goldilocks elements (lane 0, lane 1).
-/// This function extracts the internal `u64` representation of each
-/// element into two separate arrays, one per lane.
-///
-/// # Layout
-///
-/// ```text
-///     packed[i] = (field_elem_a, field_elem_b)
-///
-///     lane0[i] = field_elem_a.value    (raw u64)
-///     lane1[i] = field_elem_b.value    (raw u64)
-/// ```
-#[inline]
-pub(super) fn unpack_lanes<const WIDTH: usize>(
-    state: &[PackedGoldilocksNeon; WIDTH],
-) -> ([u64; WIDTH], [u64; WIDTH]) {
-    // Extract the raw u64 representation from each packed slot.
-    let lane0: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[0].value);
-    let lane1: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[1].value);
-    (lane0, lane1)
-}
-
-/// Pack two raw `u64` lane arrays back into a packed NEON state.
-///
-/// Each raw value is wrapped into a Goldilocks field element (with
-/// reduction modulo P) and paired into a packed slot.
-///
-/// # Layout
-///
-/// ```text
-///     lane0[i], lane1[i]  ->  packed[i] = (Goldilocks(lane0[i]), Goldilocks(lane1[i]))
-/// ```
-#[inline]
-pub(super) fn pack_lanes<const WIDTH: usize>(
-    state: &mut [PackedGoldilocksNeon; WIDTH],
-    lane0: &[u64; WIDTH],
-    lane1: &[u64; WIDTH],
-) {
-    for i in 0..WIDTH {
-        // Wrap each raw u64 into a field element and pair them.
-        state[i] = PackedGoldilocksNeon([Goldilocks::new(lane0[i]), Goldilocks::new(lane1[i])]);
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::{PrimeCharacteristicRing, PrimeField64};
-    use proptest::prelude::*;
-
-    use super::*;
-
-    type F = Goldilocks;
-
-    /// Reduce a raw `u64` to its canonical Goldilocks representative.
-    fn canon(x: u64) -> u64 {
-        F::new(x).as_canonical_u64()
-    }
-
-    proptest! {
-        // ----------------------------------------------------------------
-        // Scalar field arithmetic
-        // ----------------------------------------------------------------
-
-        /// Verify ASM addition against field addition.
-        #[test]
-        fn test_add_asm(a: u64, b: u64) {
-            let expected = (F::new(a) + F::new(b)).as_canonical_u64();
-            let got = canon(unsafe { add_asm(a, b) });
-            prop_assert_eq!(got, expected);
-        }
-
-        /// Verify ASM multiplication against field multiplication.
-        #[test]
-        fn test_mul_asm(a: u64, b: u64) {
-            let expected = (F::new(a) * F::new(b)).as_canonical_u64();
-            let got = canon(unsafe { mul_asm(a, b) });
-            prop_assert_eq!(got, expected);
-        }
-
-        /// Verify ASM fused multiply-add against field multiply-add.
-        #[test]
-        fn test_mul_add_asm(a: u64, b: u64, c: u64) {
-            let expected = (F::new(a) * F::new(b) + F::new(c)).as_canonical_u64();
-            let got = canon(unsafe { mul_add_asm(a, b, c) });
-            prop_assert_eq!(got, expected);
-        }
-
-        // ----------------------------------------------------------------
-        // Unpack: packed state -> two raw u64 lane arrays
-        // ----------------------------------------------------------------
-
-        #[test]
-        fn test_unpack_lanes_w8(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Build a packed state from two independent lane arrays.
-            let packed: [PackedGoldilocksNeon; 8] =
-                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
-
-            // Unpack into raw u64 lane arrays.
-            let (got0, got1) = unpack_lanes(&packed);
-
-            // Each raw value must be the internal representation of the field element.
-            for i in 0..8 {
-                prop_assert_eq!(got0[i], F::new(lane_a[i]).value);
-                prop_assert_eq!(got1[i], F::new(lane_b[i]).value);
-            }
-        }
-
-        #[test]
-        fn test_unpack_lanes_w12(
-            lane_a in prop::array::uniform12(any::<u64>()),
-            lane_b in prop::array::uniform12(any::<u64>()),
-        ) {
-            // Same verification, width 12.
-            let packed: [PackedGoldilocksNeon; 12] =
-                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
-
-            let (got0, got1) = unpack_lanes(&packed);
-
-            for i in 0..12 {
-                prop_assert_eq!(got0[i], F::new(lane_a[i]).value);
-                prop_assert_eq!(got1[i], F::new(lane_b[i]).value);
-            }
-        }
-
-        // ----------------------------------------------------------------
-        // Pack: two raw u64 lane arrays -> packed state
-        // ----------------------------------------------------------------
-
-        #[test]
-        fn test_pack_lanes_w8(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Pack two raw lane arrays into packed state.
-            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8];
-            pack_lanes(&mut packed, &lane_a, &lane_b);
-
-            // Each packed element must hold the two corresponding field elements.
-            for i in 0..8 {
-                prop_assert_eq!(packed[i].0[0], F::new(lane_a[i]));
-                prop_assert_eq!(packed[i].0[1], F::new(lane_b[i]));
-            }
-        }
-
-        #[test]
-        fn test_pack_lanes_w12(
-            lane_a in prop::array::uniform12(any::<u64>()),
-            lane_b in prop::array::uniform12(any::<u64>()),
-        ) {
-            // Same verification, width 12.
-            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12];
-            pack_lanes(&mut packed, &lane_a, &lane_b);
-
-            for i in 0..12 {
-                prop_assert_eq!(packed[i].0[0], F::new(lane_a[i]));
-                prop_assert_eq!(packed[i].0[1], F::new(lane_b[i]));
-            }
-        }
-
-        // ----------------------------------------------------------------
-        // Roundtrip: pack then unpack recovers canonical values
-        // ----------------------------------------------------------------
-
-        #[test]
-        fn test_roundtrip_pack_unpack_w8(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Pack two lane arrays, then unpack them.
-            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8];
-            pack_lanes(&mut packed, &lane_a, &lane_b);
-            let (out0, out1) = unpack_lanes(&packed);
-
-            // The canonical form of the recovered values must match.
-            for i in 0..8 {
-                prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64());
-                prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64());
-            }
-        }
-
-        #[test]
-        fn test_roundtrip_pack_unpack_w12(
-            lane_a in prop::array::uniform12(any::<u64>()),
-            lane_b in prop::array::uniform12(any::<u64>()),
-        ) {
-            // Same roundtrip, width 12.
-            let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12];
-            pack_lanes(&mut packed, &lane_a, &lane_b);
-            let (out0, out1) = unpack_lanes(&packed);
-
-            for i in 0..12 {
-                prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64());
-                prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64());
-            }
-        }
-
-        // ----------------------------------------------------------------
-        // Roundtrip: unpack then pack preserves packed state
-        // ----------------------------------------------------------------
-
-        #[test]
-        fn test_roundtrip_unpack_pack_w8(
-            lane_a in prop::array::uniform8(any::<u64>()),
-            lane_b in prop::array::uniform8(any::<u64>()),
-        ) {
-            // Start from a packed state.
-            let original: [PackedGoldilocksNeon; 8] =
-                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
-
-            // Unpack into raw lanes, then pack back.
-            let (raw0, raw1) = unpack_lanes(&original);
-            let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 8];
-            pack_lanes(&mut restored, &raw0, &raw1);
-
-            // The restored packed state must equal the original.
-            for i in 0..8 {
-                prop_assert_eq!(restored[i].0[0], original[i].0[0]);
-                prop_assert_eq!(restored[i].0[1], original[i].0[1]);
-            }
-        }
-
-        #[test]
-        fn test_roundtrip_unpack_pack_w12(
-            lane_a in prop::array::uniform12(any::<u64>()),
-            lane_b in prop::array::uniform12(any::<u64>()),
-        ) {
-            // Same reverse roundtrip, width 12.
-            let original: [PackedGoldilocksNeon; 12] =
-                core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]));
-
-            let (raw0, raw1) = unpack_lanes(&original);
-            let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 12];
-            pack_lanes(&mut restored, &raw0, &raw1);
-
-            for i in 0..12 {
-                prop_assert_eq!(restored[i].0[0], original[i].0[0]);
-                prop_assert_eq!(restored[i].0[1], original[i].0[1]);
-            }
-        }
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs
deleted file mode 100644
index 5ac38a28b..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs
+++ /dev/null
@@ -1,217 +0,0 @@
-use p3_field::extension::{
-    BinomiallyExtendable, BinomiallyExtendableAlgebra, HasTwoAdicBinomialExtension,
-};
-use p3_field::{PrimeCharacteristicRing, TwoAdicField, field_to_array};
-
-use crate::Goldilocks;
-
-impl BinomiallyExtendableAlgebra<Self, 2> for Goldilocks {}
-
-impl BinomiallyExtendable<2> for Goldilocks {
-    // Verifiable in Sage with
-    // `R.<x> = GF(p)[]; assert (x^2 - 7).is_irreducible()`.
-    const W: Self = Self::new(7);
-
-    // DTH_ROOT = W^((p - 1)/2).
-    const DTH_ROOT: Self = Self::new(18446744069414584320);
-
-    const EXT_GENERATOR: [Self; 2] = [
-        Self::new(18081566051660590251),
-        Self::new(16121475356294670766),
-    ];
-}
-
-impl HasTwoAdicBinomialExtension<2> for Goldilocks {
-    const EXT_TWO_ADICITY: usize = 33;
-
-    fn ext_two_adic_generator(bits: usize) -> [Self; 2] {
-        assert!(bits <= 33);
-
-        if bits == 33 {
-            [Self::ZERO, Self::new(15659105665374529263)]
-        } else {
-            [Self::two_adic_generator(bits), Self::ZERO]
-        }
-    }
-}
-
-impl BinomiallyExtendableAlgebra<Self, 3> for Goldilocks {}
-
-impl BinomiallyExtendable<3> for Goldilocks {
-    // Verifiable in Sage with
-    // `R.<x> = GF(p)[]; assert (x^3 - 2).is_irreducible()`.
-    // Same irreducible as Lambda's Degree3GoldilocksExtensionField.
-    const W: Self = Self::new(2);
-
-    // DTH_ROOT = primitive 3rd root of unity = 7^((p-1)/3) mod p.
-    const DTH_ROOT: Self = Self::new(18446744065119617025);
-
-    // Generator of GF(p^3)* = 5 + w. Verified: passes order checks for
-    // all small prime factors of p^3 - 1.
-    const EXT_GENERATOR: [Self; 3] = [Self::new(5), Self::ONE, Self::ZERO];
-}
-
-impl HasTwoAdicBinomialExtension<3> for Goldilocks {
-    // v_2(p^3 - 1) = v_2(p-1) + v_2(p^2+p+1) = 32 + 0 = 32.
-    const EXT_TWO_ADICITY: usize = 32;
-
-    fn ext_two_adic_generator(bits: usize) -> [Self; 3] {
-        assert!(bits <= 32);
-        field_to_array(Self::two_adic_generator(bits))
-    }
-}
-
-impl BinomiallyExtendableAlgebra<Self, 5> for Goldilocks {}
-
-impl BinomiallyExtendable<5> for Goldilocks {
-    // Verifiable via:
-    //  ```sage
-    //  # Define Fp
-    //  p = 2**64 - 2**32 + 1
-    //  F = GF(p)
-
-    //  # Define Fp[z]
-    //  R.<z> = PolynomialRing(F)
-
-    //  # The polynomial x^5-3 is irreducible
-    //  assert(R(z^5-3).is_irreducible())
-    //  ```
-    const W: Self = Self::new(3);
-
-    // 5-th root = w^((p - 1)/5)
-    const DTH_ROOT: Self = Self::new(1041288259238279555);
-
-    // Generator of the extension field
-    // Obtained by finding the smallest Hamming weight vector
-    // with appropriate order, starting at [0,1,0,0,0]
-    const EXT_GENERATOR: [Self; 5] = [Self::TWO, Self::ONE, Self::ZERO, Self::ZERO, Self::ZERO];
-}
-
-impl HasTwoAdicBinomialExtension<5> for Goldilocks {
-    const EXT_TWO_ADICITY: usize = 32;
-
-    fn ext_two_adic_generator(bits: usize) -> [Self; 5] {
-        assert!(bits <= 32);
-
-        field_to_array(Self::two_adic_generator(bits))
-    }
-}
-
-#[cfg(test)]
-mod test_quadratic_extension {
-
-    use num_bigint::BigUint;
-    use p3_field::extension::BinomialExtensionField;
-    use p3_field::{ExtensionField, PrimeCharacteristicRing};
-    use p3_field_testing::{
-        test_extension_field, test_field, test_packed_extension_field,
-        test_two_adic_extension_field,
-    };
-
-    use crate::Goldilocks;
-
-    type F = Goldilocks;
-    type EF = BinomialExtensionField<F, 2>;
-
-    // There is a redundant representation of zero but we already tested it
-    // when testing the base field.
-    const ZEROS: [EF; 1] = [EF::ZERO];
-    const ONES: [EF; 1] = [EF::ONE];
-
-    // Get the prime factorization of the order of the multiplicative group.
-    // i.e. the prime factorization of P^2 - 1.
-    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 9] {
-        [
-            (BigUint::from(2u8), 33),
-            (BigUint::from(3u8), 1),
-            (BigUint::from(5u8), 1),
-            (BigUint::from(7u8), 1),
-            (BigUint::from(17u8), 1),
-            (BigUint::from(179u8), 1),
-            (BigUint::from(257u16), 1),
-            (BigUint::from(65537u32), 1),
-            (BigUint::from(7361031152998637u64), 1),
-        ]
-    }
-
-    test_field!(
-        super::EF,
-        &super::ZEROS,
-        &super::ONES,
-        &super::multiplicative_group_prime_factorization()
-    );
-
-    test_extension_field!(super::F, super::EF);
-    test_two_adic_extension_field!(super::F, super::EF);
-
-    type Pef = <EF as ExtensionField<F>>::ExtensionPacking;
-    const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO];
-    const PACKED_ONES: [Pef; 1] = [Pef::ONE];
-    test_packed_extension_field!(
-        super::F,
-        super::EF,
-        super::Pef,
-        &super::PACKED_ZEROS,
-        &super::PACKED_ONES
-    );
-}
-
-#[cfg(test)]
-mod test_quintic_extension {
-
-    use num_bigint::BigUint;
-    use p3_field::extension::BinomialExtensionField;
-    use p3_field::{ExtensionField, PrimeCharacteristicRing};
-    use p3_field_testing::{
-        test_extension_field, test_field, test_packed_extension_field,
-        test_two_adic_extension_field,
-    };
-
-    use crate::Goldilocks;
-
-    type F = Goldilocks;
-    type EF = BinomialExtensionField<F, 5>;
-
-    // There is a redundant representation of zero but we already tested it
-    // when testing the base field.
-    const ZEROS: [EF; 1] = [EF::ZERO];
-    const ONES: [EF; 1] = [EF::ONE];
-
-    // Get the prime factorization of the order of the multiplicative group.
-    // i.e. the prime factorization of P^5 - 1.
-    fn multiplicative_group_prime_factorization() -> [(num_bigint::BigUint, u32); 10] {
-        [
-            (BigUint::from(2u8), 32),
-            (BigUint::from(3u8), 1),
-            (BigUint::from(5u8), 2),
-            (BigUint::from(17u8), 1),
-            (BigUint::from(257u16), 1),
-            (BigUint::from(45971u16), 1),
-            (BigUint::from(65537u32), 1),
-            (BigUint::from(255006435240067831u64), 1),
-            (BigUint::from(280083648770327405561u128), 1),
-            (BigUint::from(7053197395277272939628824863222181u128), 1),
-        ]
-    }
-
-    test_field!(
-        super::EF,
-        &super::ZEROS,
-        &super::ONES,
-        &super::multiplicative_group_prime_factorization()
-    );
-
-    test_extension_field!(super::F, super::EF);
-    test_two_adic_extension_field!(super::F, super::EF);
-
-    type Pef = <EF as ExtensionField<F>>::ExtensionPacking;
-    const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO];
-    const PACKED_ONES: [Pef; 1] = [Pef::ONE];
-    test_packed_extension_field!(
-        super::F,
-        super::EF,
-        super::Pef,
-        &super::PACKED_ZEROS,
-        &super::PACKED_ONES
-    );
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs
deleted file mode 100644
index ebe3f8c7a..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs
+++ /dev/null
@@ -1,813 +0,0 @@
-use alloc::vec;
-use alloc::vec::Vec;
-use core::fmt::{Debug, Display, Formatter};
-use core::hash::{Hash, Hasher};
-use core::iter::{Product, Sum};
-use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
-use core::{array, fmt};
-
-use num_bigint::BigUint;
-use p3_challenger::UniformSamplingField;
-use p3_field::exponentiation::exp_10540996611094048183;
-use p3_field::integers::QuotientMap;
-use p3_field::op_assign_macros::{
-    impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign,
-};
-use p3_field::{
-    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField,
-    PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64,
-    quotient_map_large_iint, quotient_map_large_uint, quotient_map_small_int,
-};
-use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner};
-use rand::Rng;
-use rand::distr::{Distribution, StandardUniform};
-use serde::{Deserialize, Serialize};
-
-/// The Goldilocks prime
-pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001;
-
-/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
-///
-/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64.
-#[derive(Copy, Clone, Default, Serialize, Deserialize)]
-#[repr(transparent)] // Important for reasoning about memory layout
-#[must_use]
-pub struct Goldilocks {
-    /// Not necessarily canonical.
-    pub(crate) value: u64,
-}
-
-impl Goldilocks {
-    /// Create a new field element from any `u64`.
-    ///
-    /// Any `u64` value is accepted. No reduction is performed since
-    /// Goldilocks uses a non-canonical internal representation.
-    #[inline]
-    pub const fn new(value: u64) -> Self {
-        Self { value }
-    }
-
-    /// Convert a `[u64; N]` array to an array of field elements.
-    ///
-    /// Const version of `input.map(Goldilocks::new)`.
-    #[inline]
-    pub const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
-        let mut output = [Self::ZERO; N];
-        let mut i = 0;
-        while i < N {
-            output[i].value = input[i];
-            i += 1;
-        }
-        output
-    }
-
-    /// Convert a `[[u64; N]; M]` array to a 2D array of field elements.
-    ///
-    /// Const version of `input.map(Goldilocks::new_array)`.
-    #[inline]
-    pub const fn new_2d_array<const N: usize, const M: usize>(
-        input: [[u64; N]; M],
-    ) -> [[Self; N]; M] {
-        let mut output = [[Self::ZERO; N]; M];
-        let mut i = 0;
-        while i < M {
-            output[i] = Self::new_array(input[i]);
-            i += 1;
-        }
-        output
-    }
-
-    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
-    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
-
-    /// A list of generators for the two-adic subgroups of the goldilocks field.
-    ///
-    /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
-    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
-        0x0000000000000001,
-        0xffffffff00000000,
-        0x0001000000000000,
-        0xfffffffeff000001,
-        0xefffffff00000001,
-        0x00003fffffffc000,
-        0x0000008000000000,
-        0xf80007ff08000001,
-        0xbf79143ce60ca966,
-        0x1905d02a5c411f4e,
-        0x9d8f2ad78bfed972,
-        0x0653b4801da1c8cf,
-        0xf2c35199959dfcb6,
-        0x1544ef2335d17997,
-        0xe0ee099310bba1e2,
-        0xf6b2cffe2306baac,
-        0x54df9630bf79450e,
-        0xabd0a6e8aa3d8a0e,
-        0x81281a7b05f9beac,
-        0xfbd41c6b8caa3302,
-        0x30ba2ecd5e93e76d,
-        0xf502aef532322654,
-        0x4b2a18ade67246b5,
-        0xea9d5a1336fbc98b,
-        0x86cdcc31c307e171,
-        0x4bbaf5976ecfefd8,
-        0xed41d05b78d6e286,
-        0x10d78dd8915a171d,
-        0x59049500004a4485,
-        0xdfa8c93ba46d2666,
-        0x7e9bd009b86a0845,
-        0x400a7f755588e659,
-        0x185629dcda58878c,
-    ]);
-
-    /// A list of powers of two from 0 to 95.
-    ///
-    /// Note that 2^{96} = -1 mod P so all powers of two can be simply
-    /// derived from this list.
-    const POWERS_OF_TWO: [Self; 96] = {
-        let mut powers_of_two = [Self::ONE; 96];
-
-        let mut i = 1;
-        while i < 64 {
-            powers_of_two[i] = Self::new(1 << i);
-            i += 1;
-        }
-        let mut var = Self::new(1 << 63);
-        while i < 96 {
-            var = const_add(var, var);
-            powers_of_two[i] = var;
-            i += 1;
-        }
-        powers_of_two
-    };
-}
-
-impl PartialEq for Goldilocks {
-    fn eq(&self, other: &Self) -> bool {
-        self.as_canonical_u64() == other.as_canonical_u64()
-    }
-}
-
-impl Eq for Goldilocks {}
-
-impl Packable for Goldilocks {}
-
-impl Hash for Goldilocks {
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        state.write_u64(self.as_canonical_u64());
-    }
-}
-
-impl Ord for Goldilocks {
-    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
-        self.as_canonical_u64().cmp(&other.as_canonical_u64())
-    }
-}
-
-impl PartialOrd for Goldilocks {
-    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Display for Goldilocks {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        Display::fmt(&self.as_canonical_u64(), f)
-    }
-}
-
-impl Debug for Goldilocks {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        Debug::fmt(&self.as_canonical_u64(), f)
-    }
-}
-
-impl Distribution<Goldilocks> for StandardUniform {
-    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
-        loop {
-            let next_u64 = rng.next_u64();
-            let is_canonical = next_u64 < Goldilocks::ORDER_U64;
-            if is_canonical {
-                return Goldilocks::new(next_u64);
-            }
-        }
-    }
-}
-
-impl UniformSamplingField for Goldilocks {
-    const MAX_SINGLE_SAMPLE_BITS: usize = 24;
-    const SAMPLING_BITS_M: [u64; 64] = {
-        let prime: u64 = P;
-        let mut a = [0u64; 64];
-        let mut k = 0;
-        while k < 64 {
-            if k == 0 {
-                a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always.
-            } else {
-                // Create a mask to zero out the last k bits
-                let mask = !((1u64 << k) - 1);
-                a[k] = prime & mask;
-            }
-            k += 1;
-        }
-        a
-    };
-}
-
-impl PrimeCharacteristicRing for Goldilocks {
-    type PrimeSubfield = Self;
-
-    const ZERO: Self = Self::new(0);
-    const ONE: Self = Self::new(1);
-    const TWO: Self = Self::new(2);
-    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
-
-    #[inline]
-    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
-        f
-    }
-
-    #[inline]
-    fn from_bool(b: bool) -> Self {
-        Self::new(b.into())
-    }
-
-    #[inline]
-    fn halve(&self) -> Self {
-        Self::new(halve_u64::<P>(self.value))
-    }
-
-    #[inline]
-    fn mul_2exp_u64(&self, exp: u64) -> Self {
-        // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P.
-        if exp < 96 {
-            *self * Self::POWERS_OF_TWO[exp as usize]
-        } else if exp < 192 {
-            -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
-        } else {
-            self.mul_2exp_u64(exp % 192)
-        }
-    }
-
-    #[inline]
-    fn div_2exp_u64(&self, mut exp: u64) -> Self {
-        // In the goldilocks field, 2^192 = 1 mod P.
-        // Thus 2^{-n} = 2^{192 - n} mod P.
-        exp %= 192;
-        self.mul_2exp_u64(192 - exp)
-    }
-
-    #[inline]
-    fn sum_array<const N: usize>(input: &[Self]) -> Self {
-        assert_eq!(N, input.len());
-        // Benchmarking shows that for N <= 3 it's faster to sum the elements directly
-        // but for N > 3 it's faster to use the .sum() methods which passes through u128's
-        // allowing for delayed reductions.
-        match N {
-            0 => Self::ZERO,
-            1 => input[0],
-            2 => input[0] + input[1],
-            3 => input[0] + input[1] + input[2],
-            _ => input.iter().copied().sum(),
-        }
-    }
-
-    #[inline]
-    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
-        // The constant OFFSET has 2 important properties:
-        // 1. It is a multiple of P.
-        // 2. It is greater than the maximum possible value of the sum of the products of two u64s.
-        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
-        assert!((N as u32) <= (1 << 31));
-        match N {
-            0 => Self::ZERO,
-            1 => lhs[0] * rhs[0],
-            2 => {
-                // We unroll the N = 2 case as it is slightly faster and this is an important case
-                // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension.
-                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
-                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
-
-                // We know that long_prod_0, long_prod_1 < OFFSET.
-                // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET.
-                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
-                // Compiler really likes defining sum_corr here instead of in the if/else.
-                let sum_corr = sum.wrapping_sub(OFFSET);
-                if over {
-                    reduce128(sum_corr)
-                } else {
-                    reduce128(sum)
-                }
-            }
-            _ => {
-                let (lo_plus_hi, hi) = lhs
-                    .iter()
-                    .zip(rhs)
-                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
-                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
-                        // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits.
-                        let val_hi = (val >> 96) as u64;
-                        // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi.
-                        // As N <= 2^32, acc_hi cannot overflow.
-                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
-                    });
-                // First, remove the hi part from lo_plus_hi.
-                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
-                // As 2^{96} = -1 mod P, we simply need to reduce lo - hi.
-                // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow.
-                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
-                reduce128(sum)
-            }
-        }
-    }
-
-    #[inline]
-    fn zero_vec(len: usize) -> Vec<Self> {
-        // SAFETY:
-        // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment
-        // and memory layout making `flatten_to_base` safe. This this will create
-        // a vector Goldilocks elements with value set to 0.
-        unsafe { flatten_to_base(vec![0u64; len]) }
-    }
-}
-
-/// Degree of the smallest permutation polynomial for Goldilocks.
-///
-/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
-impl InjectiveMonomial<7> for Goldilocks {}
-
-impl PermutationMonomial<7> for Goldilocks {
-    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
-    ///
-    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
-    fn injective_exp_root_n(&self) -> Self {
-        exp_10540996611094048183(*self)
-    }
-}
-
-impl RawDataSerializable for Goldilocks {
-    impl_raw_serializable_primefield64!();
-}
-
-impl Field for Goldilocks {
-    #[cfg(all(
-        target_arch = "x86_64",
-        target_feature = "avx2",
-        not(target_feature = "avx512f")
-    ))]
-    type Packing = crate::PackedGoldilocksAVX2;
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-    type Packing = crate::PackedGoldilocksAVX512;
-
-    // PATCHED for bench_vs_plonky3: disable NEON packing for apples-to-apples
-    // scalar comparison against Lambda STARK. Upstream: `crate::PackedGoldilocksNeon`.
-    #[cfg(target_arch = "aarch64")]
-    type Packing = Self;
-
-    #[cfg(not(any(
-        all(
-            target_arch = "x86_64",
-            target_feature = "avx2",
-            not(target_feature = "avx512f")
-        ),
-        all(target_arch = "x86_64", target_feature = "avx512f"),
-        target_arch = "aarch64",
-    )))]
-    type Packing = Self;
-
-    // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator()
-    const GENERATOR: Self = Self::new(7);
-
-    fn is_zero(&self) -> bool {
-        self.value == 0 || self.value == Self::ORDER_U64
-    }
-
-    fn try_inverse(&self) -> Option<Self> {
-        if self.is_zero() {
-            return None;
-        }
-
-        Some(gcd_inversion(*self))
-    }
-
-    #[inline]
-    fn order() -> BigUint {
-        P.into()
-    }
-}
-
-// We use macros to implement QuotientMap<Int> for all integer types except for u64 and i64.
-quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
-quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
-quotient_map_large_uint!(
-    Goldilocks,
-    u64,
-    Goldilocks::ORDER_U64,
-    "`[0, 2^64 - 2^32]`",
-    "`[0, 2^64 - 1]`",
-    [u128]
-);
-quotient_map_large_iint!(
-    Goldilocks,
-    i64,
-    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
-    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
-    [(i128, u128)]
-);
-
-impl QuotientMap<u64> for Goldilocks {
-    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
-    ///
-    /// No reduction is needed as the internal value is allowed
-    /// to be any u64.
-    #[inline]
-    fn from_int(int: u64) -> Self {
-        Self::new(int)
-    }
-
-    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
-    ///
-    /// Return `None` if the given integer is greater than `p = 2^64 - 2^32 + 1`.
-    #[inline]
-    fn from_canonical_checked(int: u64) -> Option<Self> {
-        (int < Self::ORDER_U64).then(|| Self::new(int))
-    }
-
-    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
-    ///
-    /// # Safety
-    /// In this case this function is actually always safe as the internal
-    /// value is allowed to be any u64.
-    #[inline(always)]
-    unsafe fn from_canonical_unchecked(int: u64) -> Self {
-        Self::new(int)
-    }
-}
-
-impl QuotientMap<i64> for Goldilocks {
-    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
-    ///
-    /// We simply need to deal with the sign.
-    #[inline]
-    fn from_int(int: i64) -> Self {
-        if int >= 0 {
-            Self::new(int as u64)
-        } else {
-            Self::new(Self::ORDER_U64.wrapping_add_signed(int))
-        }
-    }
-
-    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
-    ///
-    /// Returns none if the input does not lie in the range `(-(2^63 - 2^31), 2^63 - 2^31)`.
-    #[inline]
-    fn from_canonical_checked(int: i64) -> Option<Self> {
-        const POS_BOUND: i64 = (P >> 1) as i64;
-        const NEG_BOUND: i64 = -POS_BOUND;
-        match int {
-            0..=POS_BOUND => Some(Self::new(int as u64)),
-            NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
-            _ => None,
-        }
-    }
-
-    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
-    ///
-    /// # Safety
-    /// In this case this function is actually always safe as the internal
-    /// value is allowed to be any u64.
-    #[inline(always)]
-    unsafe fn from_canonical_unchecked(int: i64) -> Self {
-        Self::from_int(int)
-    }
-}
-
-impl PrimeField for Goldilocks {
-    fn as_canonical_biguint(&self) -> BigUint {
-        self.as_canonical_u64().into()
-    }
-}
-
-impl PrimeField64 for Goldilocks {
-    const ORDER_U64: u64 = P;
-
-    #[inline]
-    fn as_canonical_u64(&self) -> u64 {
-        let mut c = self.value;
-        // We only need one condition subtraction, since 2 * ORDER would not fit in a u64.
-        if c >= Self::ORDER_U64 {
-            c -= Self::ORDER_U64;
-        }
-        c
-    }
-}
-
-impl TwoAdicField for Goldilocks {
-    const TWO_ADICITY: usize = 32;
-
-    fn two_adic_generator(bits: usize) -> Self {
-        assert!(bits <= Self::TWO_ADICITY);
-        Self::TWO_ADIC_GENERATORS[bits]
-    }
-}
-
-/// A const version of the addition function.
-///
-/// Useful for constructing constants values in const contexts. Outside of
-/// const contexts, Add should be used instead.
-#[inline]
-const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
-    let (sum, over) = lhs.value.overflowing_add(rhs.value);
-    let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
-    if over {
-        sum += Goldilocks::NEG_ORDER;
-    }
-    Goldilocks::new(sum)
-}
-
-impl Add for Goldilocks {
-    type Output = Self;
-
-    #[inline]
-    fn add(self, rhs: Self) -> Self {
-        let (sum, over) = self.value.overflowing_add(rhs.value);
-        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
-        if over {
-            // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not
-            // sufficient for double-overflow.
-            // This assume does two things:
-            //  1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip
-            //     this check.
-            //  2. Hints to the compiler how rare this double-overflow is (thus handled better with
-            //     a branch).
-            unsafe {
-                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
-            }
-            branch_hint();
-            sum += Self::NEG_ORDER; // Cannot overflow.
-        }
-        Self::new(sum)
-    }
-}
-
-impl Sub for Goldilocks {
-    type Output = Self;
-
-    #[inline]
-    fn sub(self, rhs: Self) -> Self {
-        let (diff, under) = self.value.overflowing_sub(rhs.value);
-        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
-        if under {
-            // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not
-            // sufficient for double-underflow.
-            // This assume does two things:
-            //  1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER,
-            //     then it can skip this check.
-            //  2. Hints to the compiler how rare this double-underflow is (thus handled better
-            //     with a branch).
-            unsafe {
-                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
-            }
-            branch_hint();
-            diff -= Self::NEG_ORDER; // Cannot underflow.
-        }
-        Self::new(diff)
-    }
-}
-
-impl Neg for Goldilocks {
-    type Output = Self;
-
-    #[inline]
-    fn neg(self) -> Self::Output {
-        Self::new(Self::ORDER_U64 - self.as_canonical_u64())
-    }
-}
-
-impl Mul for Goldilocks {
-    type Output = Self;
-
-    #[inline]
-    fn mul(self, rhs: Self) -> Self {
-        reduce128(u128::from(self.value) * u128::from(rhs.value))
-    }
-}
-
-impl_add_assign!(Goldilocks);
-impl_sub_assign!(Goldilocks);
-impl_mul_methods!(Goldilocks);
-impl_div_methods!(Goldilocks, Goldilocks);
-
-impl Sum for Goldilocks {
-    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
-        // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2.
-
-        // This sum will not overflow so long as iter.len() < 2^64.
-        let sum = iter.map(|x| x.value as u128).sum::<u128>();
-        reduce128(sum)
-    }
-}
-
-/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
-/// field order and `2^64`.
-#[inline]
-pub(crate) fn reduce128(x: u128) -> Goldilocks {
-    let (x_lo, x_hi) = split(x); // This is a no-op
-    let x_hi_hi = x_hi >> 32;
-    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
-
-    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
-    if borrow {
-        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
-        t0 -= Goldilocks::NEG_ORDER; // Cannot underflow.
-    }
-    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
-    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
-    Goldilocks::new(t2)
-}
-
-#[inline]
-#[allow(clippy::cast_possible_truncation)]
-const fn split(x: u128) -> (u64, u64) {
-    (x as u64, (x >> 64) as u64)
-}
-
-/// Fast addition modulo ORDER for x86-64.
-/// This function is marked unsafe for the following reasons:
-///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
-///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
-///     the registers, so its use is not recommended when either input will be used again.
-#[inline(always)]
-#[cfg(target_arch = "x86_64")]
-unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
-    unsafe {
-        let res_wrapped: u64;
-        let adjustment: u64;
-        core::arch::asm!(
-            "add {0}, {1}",
-            // Trick. The carry flag is set iff the addition overflowed.
-            // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
-            // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
-            // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
-            // on overflow; this happens be NEG_ORDER.
-            // Note that the CPU does not realize that the result of sbb x, x does not actually depend
-            // on x. We must write the result to a register that we know to be ready. We have a
-            // dependency on {1} anyway, so let's use it.
-            "sbb {1:e}, {1:e}",
-            inlateout(reg) x => res_wrapped,
-            inlateout(reg) y => adjustment,
-            options(pure, nomem, nostack),
-        );
-        assume(x != 0 || (res_wrapped == y && adjustment == 0));
-        assume(y != 0 || (res_wrapped == x && adjustment == 0));
-        // Add NEG_ORDER == subtract ORDER.
-        // Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
-        res_wrapped + adjustment
-    }
-}
-
-#[inline(always)]
-#[cfg(not(target_arch = "x86_64"))]
-unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
-    let (res_wrapped, carry) = x.overflowing_add(y);
-    // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
-    res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
-}
-
-/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm.
-///
-/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf.
-/// The key idea is to compute update factors which are incorrect by a known power of 2 which
-/// can be corrected at the end. These update factors can then be used to construct the inverse
-/// via a simple linear combination.
-///
-/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic.
-fn gcd_inversion(input: Goldilocks) -> Goldilocks {
-    // Initialise our values to the value we want to invert and the prime.
-    let (mut a, mut b) = (input.value, P);
-
-    // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`.
-    // This means we will need `126` iterations of the inner loop ensure `len(a) + len(b) ≤ 2`.
-    // We split the iterations into 2 rounds of length 63.
-    const ROUND_SIZE: usize = 63;
-
-    // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted
-    // version which doesn't do any computations involving g. But either the compiler works this out
-    // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks.
-    let (f00, _, f10, _) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
-    let (_, _, f11, g11) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
-
-    // The update factors are i64's except we need to interpret -2^63 as 2^63.
-    // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
-    let u = from_unusual_int(f00);
-    let v = from_unusual_int(f10);
-    let u_fac11 = from_unusual_int(f11);
-    let v_fac11 = from_unusual_int(g11);
-
-    // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}.
-    // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66.
-    (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
-}
-
-/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63.
-const fn from_unusual_int(int: i64) -> Goldilocks {
-    if (int >= 0) || (int == i64::MIN) {
-        Goldilocks::new(int as u64)
-    } else {
-        Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::extension::BinomialExtensionField;
-    use p3_field_testing::{
-        test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field,
-    };
-
-    use super::*;
-
-    type F = Goldilocks;
-    type EF = BinomialExtensionField<F, 5>;
-
-    #[test]
-    fn test_goldilocks() {
-        let f = F::new(100);
-        assert_eq!(f.as_canonical_u64(), 100);
-
-        // Over the Goldilocks field, the following set of equations hold
-        // p               = 0
-        // 2^64 - 2^32 + 1 = 0
-        // 2^64            = 2^32 - 1
-        let f = F::new(u64::MAX);
-        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
-
-        let f = F::from_u64(u64::MAX);
-        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
-
-        // Generator check
-        let expected_multiplicative_group_generator = F::new(7);
-        assert_eq!(F::GENERATOR, expected_multiplicative_group_generator);
-        assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64);
-
-        // Check on `reduce_u128`
-        let x = u128::MAX;
-        let y = reduce128(x);
-        // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1
-        // 2^128 - 1 = (2^64 - 1) * (2^64 + 1)
-        //           = (2^32 - 1 - 1) * (2^32 - 1 + 1)
-        //           = (2^32 - 2) * (2^32)
-        //           = 2^64 - 2 * 2^32
-        //           = 2^64 - 2^33
-        //           = 2^32 - 1 - 2^33
-        //           = - 2^32 - 1
-        let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE;
-        assert_eq!(y, expected_result);
-
-        let f = F::new(100);
-        assert_eq!(f.injective_exp_n().injective_exp_root_n(), f);
-        assert_eq!(y.injective_exp_n().injective_exp_root_n(), y);
-        assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO);
-    }
-
-    // Goldilocks has a redundant representation for both 0 and 1.
-    const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)];
-    const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)];
-
-    // Get the prime factorization of the order of the multiplicative group.
-    // i.e. the prime factorization of P - 1.
-    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] {
-        [
-            (BigUint::from(2u8), 32),
-            (BigUint::from(3u8), 1),
-            (BigUint::from(5u8), 1),
-            (BigUint::from(17u8), 1),
-            (BigUint::from(257u16), 1),
-            (BigUint::from(65537u32), 1),
-        ]
-    }
-
-    test_field!(
-        crate::Goldilocks,
-        &super::ZEROS,
-        &super::ONES,
-        &super::multiplicative_group_prime_factorization()
-    );
-    test_prime_field!(crate::Goldilocks);
-    test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES);
-    test_two_adic_field!(crate::Goldilocks);
-
-    test_field_dft!(
-        radix2dit,
-        crate::Goldilocks,
-        super::EF,
-        p3_dft::Radix2Dit<_>
-    );
-    test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers);
-    test_field_dft!(
-        parallel,
-        crate::Goldilocks,
-        super::EF,
-        p3_dft::Radix2DitParallel<crate::Goldilocks>
-    );
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs
deleted file mode 100644
index 9447fe094..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs
+++ /dev/null
@@ -1,42 +0,0 @@
-//! The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
-
-#![no_std]
-
-extern crate alloc;
-
-mod extension;
-mod goldilocks;
-mod mds;
-mod poseidon2;
-
-pub use goldilocks::*;
-pub use mds::*;
-pub use poseidon2::*;
-
-pub mod poseidon1;
-
-#[cfg(target_arch = "aarch64")]
-mod aarch64_neon;
-
-#[cfg(target_arch = "aarch64")]
-pub use aarch64_neon::*;
-
-#[cfg(all(
-    target_arch = "x86_64",
-    target_feature = "avx2",
-    not(target_feature = "avx512f")
-))]
-mod x86_64_avx2;
-
-#[cfg(all(
-    target_arch = "x86_64",
-    target_feature = "avx2",
-    not(target_feature = "avx512f")
-))]
-pub use x86_64_avx2::*;
-
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-mod x86_64_avx512;
-
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-pub use x86_64_avx512::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs
deleted file mode 100644
index df41485b3..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs
+++ /dev/null
@@ -1,761 +0,0 @@
-//! MDS matrices over the Goldilocks field, and permutations defined by them.
-//!
-//! NB: Not all sizes have fast implementations of their permutations.
-//! Supported sizes: 8, 12, 16, 24, 32, 64, 68.
-//! Sizes 8 and 12 are from Plonky2, size 16 was found as part of concurrent
-//! work by Angus Gruen and Hamish Ivey-Law. Other sizes are from Ulrich Haböck's
-//! database.
-
-use p3_dft::Radix2Bowers;
-use p3_mds::MdsPermutation;
-use p3_mds::karatsuba_convolution::Convolve;
-use p3_mds::util::{apply_circulant, apply_circulant_fft, first_row_to_first_col};
-use p3_symmetric::Permutation;
-
-use crate::{Goldilocks, reduce128};
-
-#[derive(Clone, Debug, Default)]
-pub struct MdsMatrixGoldilocks;
-
-/// Instantiate convolution for "small" RHS vectors over Goldilocks.
-///
-/// Here "small" means N = len(rhs) <= 16 and sum(r for r in rhs) <
-/// 2^51, though in practice the sum will be less than 2^9.
-#[derive(Debug)]
-pub struct SmallConvolveGoldilocks;
-impl Convolve<Goldilocks, i128, i64> for SmallConvolveGoldilocks {
-    const T_ZERO: i128 = 0;
-    const U_ZERO: i64 = 0;
-
-    #[inline(always)]
-    fn halve(val: i128) -> i128 {
-        val >> 1
-    }
-
-    /// Return the lift of a Goldilocks element, 0 <= input.value <= P
-    /// < 2^64. We widen immediately, since some valid Goldilocks elements
-    /// don't fit in an i64, and since in any case overflow can occur
-    /// for even the smallest convolutions.
-    #[inline(always)]
-    fn read(input: Goldilocks) -> i128 {
-        input.value as i128
-    }
-
-    /// For a convolution of size N, |x| < N * 2^64 and (as per the
-    /// assumption above), |y| < 2^51. So the product is at most N *
-    /// 2^115 which will not overflow for N <= 16. We widen `y` at
-    /// this point to perform the multiplication.
-    #[inline(always)]
-    fn parity_dot<const N: usize>(u: [i128; N], v: [i64; N]) -> i128 {
-        let mut s = 0i128;
-        for i in 0..N {
-            s += u[i] * v[i] as i128;
-        }
-        s
-    }
-
-    /// The assumptions above mean z < N^2 * 2^115, which is at most
-    /// 2^123 when N <= 16.
-    ///
-    /// NB: Even though intermediate values could be negative, the
-    /// output must be non-negative since the inputs were
-    /// non-negative.
-    #[inline(always)]
-    fn reduce(z: i128) -> Goldilocks {
-        debug_assert!(z >= 0);
-        reduce128(z as u128)
-    }
-}
-
-const FFT_ALGO: Radix2Bowers = Radix2Bowers;
-
-pub(crate) const MATRIX_CIRC_MDS_8_SML_ROW: [i64; 8] = [7, 1, 3, 8, 8, 3, 4, 9];
-
-/// First column of the circulant MDS matrix for width 8, derived from the first row.
-pub const MATRIX_CIRC_MDS_8_COL: [i64; 8] = first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW);
-
-impl Permutation<[Goldilocks; 8]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 8]) -> [Goldilocks; 8] {
-        const MATRIX_CIRC_MDS_8_SML_COL: [i64; 8] =
-            first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW);
-        SmallConvolveGoldilocks::apply(
-            input,
-            MATRIX_CIRC_MDS_8_SML_COL,
-            SmallConvolveGoldilocks::conv8,
-        )
-    }
-}
-impl MdsPermutation<Goldilocks, 8> for MdsMatrixGoldilocks {}
-
-pub(crate) const MATRIX_CIRC_MDS_12_SML_ROW: [i64; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10];
-
-/// First column of the circulant MDS matrix for width 12, derived from the first row.
-pub const MATRIX_CIRC_MDS_12_COL: [i64; 12] = first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW);
-
-impl Permutation<[Goldilocks; 12]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 12]) -> [Goldilocks; 12] {
-        const MATRIX_CIRC_MDS_12_SML_COL: [i64; 12] =
-            first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW);
-        SmallConvolveGoldilocks::apply(
-            input,
-            MATRIX_CIRC_MDS_12_SML_COL,
-            SmallConvolveGoldilocks::conv12,
-        )
-    }
-}
-impl MdsPermutation<Goldilocks, 12> for MdsMatrixGoldilocks {}
-
-pub(crate) const MATRIX_CIRC_MDS_16_SML_ROW: [i64; 16] =
-    [1, 1, 51, 1, 11, 17, 2, 1, 101, 63, 15, 2, 67, 22, 13, 3];
-
-impl Permutation<[Goldilocks; 16]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 16]) -> [Goldilocks; 16] {
-        const MATRIX_CIRC_MDS_16_SML_COL: [i64; 16] =
-            first_row_to_first_col(&MATRIX_CIRC_MDS_16_SML_ROW);
-        SmallConvolveGoldilocks::apply(
-            input,
-            MATRIX_CIRC_MDS_16_SML_COL,
-            SmallConvolveGoldilocks::conv16,
-        )
-    }
-}
-impl MdsPermutation<Goldilocks, 16> for MdsMatrixGoldilocks {}
-
-#[rustfmt::skip]
-pub(crate) const MATRIX_CIRC_MDS_24_GOLDILOCKS: [u64; 24] = [
-    0x5FFFFFFFA00AAAAB, 0x24021AB75BBFE656, 0x7BE9082D73B06DF5, 0x2282863E9C3A5A62,
-    0xE0071C70DFFC71C8, 0x796CB65AB42A1A63, 0xDBBBBFFADFFDDDE3, 0x23B88EE217C5C9C2,
-    0x20030C309FFB6DB7, 0x23C3C64763BE1E1D, 0x0F93B7C9CC51362E, 0xC697A1094BD0850A,
-    0xDFFFFFFF1FFC71C8, 0xC15A4FD614950302, 0xC41D883A4C4DEDF2, 0x187879BC23C46462,
-    0x5FFCF3CEDFFE79E8, 0x1C41DF105B82398E, 0x64444003DFFDDDDA, 0x76EDDBB6F7E51F95,
-    0x1FF8E38E20038E39, 0x214139BD5C40A09D, 0x3065B7CCF3B3B621, 0x23B6F4622485CEDC,
-];
-
-impl Permutation<[Goldilocks; 24]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 24]) -> [Goldilocks; 24] {
-        apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input)
-    }
-}
-impl MdsPermutation<Goldilocks, 24> for MdsMatrixGoldilocks {}
-
-#[rustfmt::skip]
-const MATRIX_CIRC_MDS_32_GOLDILOCKS: [u64; 32] = [
-    0x0800000000000000, 0x69249248B4924925, 0x3ABD5EAF15EAF57B, 0x294A5294739CE73A,
-    0x59E2D2CEB4B3C5A6, 0x087FBE00FF7C0220, 0xA554AA94A554AA96, 0xF00080FEFFDF8005,
-    0x64CCCCCC6666699A, 0x5B13AD8973B139D9, 0xAD4A55ACA54AD5AA, 0xDA496DA3B492DB8A,
-    0x4AD696955A5694B5, 0xA4A6B29A25B496D3, 0xA74EA162162BD3A9, 0xC698B3A5662CE98C,
-    0xA7FFFFFF55555556, 0x4AAAAAAA5AAAAAAB, 0xB047DC113DC11F71, 0x8BA2E8B99B26C9B3,
-    0xD259696C5A5B4D2E, 0xA7D540AA557EA9F6, 0x8B6E922D26DB249C, 0xFAAA805455602AAD,
-    0xCB33333266666334, 0xD13B17619B13B277, 0x45B26D9326E9374A, 0x52AB552A5AA9556B,
-    0x68ED2D2DB4B87697, 0x8B264C98A74E9D3B, 0x09EC23D83D847B09, 0x2C9A4D26669349A5,
-];
-
-impl Permutation<[Goldilocks; 32]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 32]) -> [Goldilocks; 32] {
-        const ENTRIES: [u64; 32] = first_row_to_first_col(&MATRIX_CIRC_MDS_32_GOLDILOCKS);
-        apply_circulant_fft(&FFT_ALGO, ENTRIES, &input)
-    }
-}
-impl MdsPermutation<Goldilocks, 32> for MdsMatrixGoldilocks {}
-
-#[rustfmt::skip]
-const MATRIX_CIRC_MDS_64_GOLDILOCKS: [u64; 64] = [
-    0x07FFFFFFFC000000, 0xFBFFFFFF04000001, 0x436DB6DB25B6DB6E, 0x4AAAAAAA5AAAAAAB,
-    0x45B2D96C6D96CB66, 0x3BC7BC7B87BC7BC8, 0x6318C63125294A53, 0xCB3672CCCD9CB368,
-    0xB43CB5A12D68796C, 0xFBFBFBFAFBFBFBFD, 0x883DBF107B7E2210, 0x8A7689B59B629DA3,
-    0xF7FEFFDF00000001, 0x7B7C83BBC83BC47C, 0xEFF0410107EF7F83, 0x2CD8B3629CB272CA,
-    0x9800019900CCCE67, 0xFBFFFBFF07FFFC01, 0x94EC4A758C4EC628, 0xDA5A5B4A6D2D2E1F,
-    0xFFEFC080FC003FFF, 0xBC387BC2C783BC79, 0xB492DB686D24B6F3, 0x1DB6925B4B6E2477,
-    0x7801E0EF87BFFF10, 0xFC0803FAFBFC0409, 0x3780FE03C086F21C, 0x8B749B224DB22D94,
-    0x32648B36B76E9923, 0x3BC3C3C387C3C3C4, 0x79AF286B4FCA1AF3, 0x9E2762758B627628,
-    0x52AAAAAA56AAAAAB, 0xFBFFFFFEFC000001, 0xF7FFFFFF08000001, 0x2CCCCCCC9CCCCCCD,
-    0xCF286BC946BCA1B0, 0xBC483B7B883B7C49, 0xD9364D9287C1F07D, 0xAD5A94A8A95AD5AA,
-    0xFF871002C400F1E1, 0xFC03FC02FC03FC05, 0xD29495A4D6D4B4A6, 0x6C926DD1DD24DB65,
-    0x1EDC247B4DB64937, 0x7C7B843B47BC437D, 0xA55A95AAAD5AD52C, 0x4A96D5A45AD694A6,
-    0xFE6664CBCD999801, 0xFC0003FF08000401, 0x1EC4F09D64EC4D8A, 0x9E1E1D2C8B4B4A5B,
-    0xD9270937709B64DC, 0x3BB77C4448843B78, 0xFFFFFFDF03FF0021, 0x59D8761D2D8A6299,
-    0xC3496878A5E5A4B5, 0xFBF80402FC0403F9, 0x5ECD9B360E142851, 0x6D925D6429D64976,
-    0xA8AE615C19CC2B99, 0xBC44444388444445, 0xDFE3F1F81CFC7E40, 0xDA4924916D24924A,
-];
-
-impl Permutation<[Goldilocks; 64]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 64]) -> [Goldilocks; 64] {
-        const ENTRIES: [u64; 64] = first_row_to_first_col(&MATRIX_CIRC_MDS_64_GOLDILOCKS);
-        apply_circulant_fft(&FFT_ALGO, ENTRIES, &input)
-    }
-}
-impl MdsPermutation<Goldilocks, 64> for MdsMatrixGoldilocks {}
-
-#[rustfmt::skip]
-const MATRIX_CIRC_MDS_68_GOLDILOCKS: [u64; 68] = [
-    0x03C3C3C3FC3C3C3C, 0x6799AFC54A69BC7D, 0xDA8C2C496A74B03B, 0x1E641D7AB35ED229,
-    0x9239DA20DA3A2686, 0x6E23D41459EBA8C4, 0x7BC412896E2A6B3A, 0x9082059089ABD4FC,
-    0x94A16FA8B0339EEE, 0x85650EC91BB519C9, 0x1600745267E94DE1, 0xFFFD8405C82020AB,
-    0x21BDE80429DCED6A, 0x8ACE123AF754E343, 0xFFC7211605D2BDAE, 0xC21187AE15900F4D,
-    0x9C4A889708568DC6, 0x65A5A726B5758D8E, 0x949DB90B9AC0D11A, 0x23B6CF7C368BBE52,
-    0xD5128DDF59CB5A35, 0xF53BCC5BDADF3A0A, 0xBA7C5112F4BAB1CD, 0x4B93989C5B729351,
-    0x6534B7E50E4AD1CB, 0x640061B54C918405, 0x0E66E1F90D2C9311, 0x31C8649B0FE7557F,
-    0x0E9190D165F4A8F3, 0x52DF336BB708F919, 0x3C0F6697F14065A5, 0xBE8190942EC50031,
-    0x60038E9ACC701118, 0x73F105909A55A88B, 0xFEBEBEBDABEBEBED, 0x6F52163A64B03467,
-    0xFBAE131F23A12F56, 0x1950493BC70D0676, 0x2886550DB5A1BBBF, 0x15B003D6E58181D7,
-    0x3A4E7D9D44F100F8, 0x6CC3AB896025E6A0, 0x7E23E68456F825E5, 0x079CDD570B591A16,
-    0xEC15A830C3D2CCD1, 0xCF4C722D2C0F8A0E, 0xC1BB6F5591B59A26, 0xB63A5931A607BDE0,
-    0x43A0AD0B71040187, 0x7E4B492889D1CEE0, 0x734153F3F0C31C5B, 0x98D8D756B2725A5B,
-    0x5589D20D74BA00B8, 0xB2DF58DF0A312509, 0xFABC378690D64A3A, 0x700640AFC244B695,
-    0xFFA652236547F3BE, 0x2B9CA498A001D059, 0x7DACA6F16787D5DE, 0xAAAD774FAC613EA3,
-    0xA88583816975CD56, 0x78B71DC516FF49CA, 0xC7BF095DF702FFA6, 0x78A60B3F971783B3,
-    0xCB158EF40BC75CAC, 0xA97E818DBC152B4C, 0x9FC8339D415C3999, 0x006A88C0A0D8201C,
-];
-
-impl Permutation<[Goldilocks; 68]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [Goldilocks; 68]) -> [Goldilocks; 68] {
-        apply_circulant(&MATRIX_CIRC_MDS_68_GOLDILOCKS, &input)
-    }
-}
-impl MdsPermutation<Goldilocks, 68> for MdsMatrixGoldilocks {}
-
-#[cfg(test)]
-mod tests {
-    use p3_symmetric::Permutation;
-
-    use super::{Goldilocks, MdsMatrixGoldilocks};
-
-    #[test]
-    fn goldilocks8() {
-        let input: [Goldilocks; 8] = Goldilocks::new_array([
-            2434589605738284713,
-            4817685620989478889,
-            13397079175138649456,
-            11944520631108649751,
-            1033251468644039632,
-            3092099742268329866,
-            7160548811622790454,
-            9959569614427134344,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 8] = Goldilocks::new_array([
-            16726687146516531007,
-            14721040752765534861,
-            15566838577475948790,
-            9095485010737904250,
-            11353934351835864222,
-            11056556168691087893,
-            4199602889124860181,
-            315643510993921470,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-
-    #[test]
-    fn goldilocks12() {
-        let input: [Goldilocks; 12] = Goldilocks::new_array([
-            14847187883725400244,
-            969392934980971521,
-            6996647758016470432,
-            4674844440624672154,
-            264841656685969785,
-            1246852265697711623,
-            18223868478428473484,
-            12122736699239070772,
-            11263701854732819430,
-            12739925508864285577,
-            11648637570857932167,
-            14090978315217600393,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 12] = Goldilocks::new_array([
-            9322351889214742299,
-            8700136572060418355,
-            4881757876459003977,
-            9899544690241851021,
-            480548822895830465,
-            5445915149371405525,
-            14955363277757168581,
-            6672733082273363313,
-            190938676320003294,
-            1613225933948270736,
-            3549006224849989171,
-            12169032187873197425,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-
-    #[test]
-    fn goldilocks16() {
-        let input: [Goldilocks; 16] = Goldilocks::new_array([
-            13216135600341032847,
-            15626390207663319651,
-            2052474569300149934,
-            4375663431730581786,
-            16596827905941257435,
-            10019626608444427271,
-            7831946179065963230,
-            17104499871144693506,
-            9021930732511690478,
-            6899419210615882449,
-            8131182521761419514,
-            432489675596019804,
-            8508050013409958723,
-            14134506582804571789,
-            13283546413390931641,
-            14711125975653831032,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 16] = Goldilocks::new_array([
-            9484392671298797780,
-            149770626972189150,
-            12125722600598304117,
-            15945232149672903756,
-            13199929870021500593,
-            18443980893262804946,
-            317150800081307627,
-            16910019239751125049,
-            1996802739033818490,
-            11668458913264624237,
-            11078800762167869397,
-            13758408662406282356,
-            11119677412113674380,
-            7344117715971661026,
-            4202436890275702092,
-            681166793519210465,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-
-    #[test]
-    fn goldilocks24() {
-        let input: [Goldilocks; 24] = Goldilocks::new_array([
-            11426771245122339662,
-            5975488243963332229,
-            11441424994503305651,
-            5755561333702259678,
-            7295454168648181339,
-            16724279929816174064,
-            32359231037136391,
-            3713621595270370753,
-            8421765959140936778,
-            12370571593326246544,
-            8633733294559731287,
-            12765436832373161027,
-            15606692828890413034,
-            8068160018166226874,
-            10719661629577139538,
-            13036735610140127982,
-            10213543772818211674,
-            8041886705706266368,
-            12022983417703446028,
-            4179370708601587579,
-            11125302089484330465,
-            9904943018174649533,
-            16178194376951442671,
-            1545799842160818502,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 24] = Goldilocks::new_array([
-            18431075688485197060,
-            14823984346528185622,
-            7262979358411339215,
-            14816911393874702213,
-            6721523710303409972,
-            10829861327716364029,
-            2456948878733883601,
-            11088379938350287658,
-            3820735023521527858,
-            9062288923770492958,
-            5159244568306327366,
-            1401669669887165869,
-            11908734248351870182,
-            10640195377186320543,
-            6552733980894593378,
-            17103376282032495459,
-            5204287788603805758,
-            17783185518697631139,
-            9006863878586007300,
-            11122535637762904803,
-            5271621316102699962,
-            9734499541452484536,
-            11778274360927642637,
-            3217831681350496533,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-
-    #[test]
-    fn goldilocks32() {
-        let input: [Goldilocks; 32] = Goldilocks::new_array([
-            8401806579759049284,
-            14709608922272986544,
-            8130995604641968478,
-            7833133203357642391,
-            10700492548100684406,
-            3941105252506602047,
-            8122370916776133262,
-            15079919378435648206,
-            8774521769784086994,
-            16794844316583392853,
-            9356562741425567167,
-            13317198313361936216,
-            7187680218428599522,
-            16525662096158660997,
-            540453741156061014,
-            16543585577270698663,
-            3802215918136285729,
-            11389297895303247764,
-            5133769394766075512,
-            1057795099426170863,
-            18037861421172314665,
-            17632255188776359310,
-            17616515088477043142,
-            13307921676744533876,
-            17602277262015191215,
-            15819040654617566738,
-            11961318546000835928,
-            15593174310433874065,
-            9152657050882549004,
-            4801868480369948110,
-            13202076339494141066,
-            726396847460932316,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 32] = Goldilocks::new_array([
-            1179701925859507209,
-            5543239597787055637,
-            5978278622530964070,
-            3622388166841103287,
-            11383243182536830899,
-            14719109850604985734,
-            17672601866826623850,
-            4879627080283827596,
-            7556887460241466109,
-            9548493506061808122,
-            13980851986825291174,
-            2029844508485082398,
-            10375517623784134775,
-            13067093881736606569,
-            6446569064196467795,
-            15375603814779462714,
-            11307946648742033371,
-            1593906954637160608,
-            5776169226282316678,
-            8167048017892669861,
-            3954052226208277367,
-            9346878497567392707,
-            5570872870988220142,
-            10792661164389799960,
-            17494962593174487938,
-            7080549557843445752,
-            14059834522311268132,
-            17747288366997773235,
-            17158122400620315305,
-            6816598002359267850,
-            12363049840026116993,
-            13313901185845854868,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-
-    #[test]
-    fn goldilocks64() {
-        let input: [Goldilocks; 64] = Goldilocks::new_array([
-            3471075506106776899,
-            4817046918282259009,
-            3480368692354016145,
-            18110937755057600106,
-            3130862083451221140,
-            15376650156021437015,
-            7997596749112997445,
-            7742916918728590149,
-            421644639408377358,
-            2491271421424548020,
-            1940196613872160755,
-            7152053147988203177,
-            13697425352450853423,
-            15877844788345672674,
-            17787098720906653510,
-            6857627524724866519,
-            8541180216786820396,
-            10769715704553877654,
-            9265712399189924160,
-            10220120296438955872,
-            18201417281995610945,
-            6749698931189855822,
-            13700000989116811950,
-            13205437213697578097,
-            10514342943989454609,
-            9926015350795325725,
-            2289808224483690257,
-            12598806357998460973,
-            14393945610969324307,
-            4744625557965362093,
-            2270701163031951561,
-            2927942398784334090,
-            5250916386894733430,
-            4030189910566345872,
-            4953663590324639075,
-            1241519685782896035,
-            8681312160951359069,
-            8236353015475387411,
-            4972690458759871996,
-            1396852754187463352,
-            17512022752774329733,
-            14009268822557836700,
-            1346736409027879377,
-            7609463340861239931,
-            10701512803758419515,
-            5067199073587389986,
-            5030018986055211116,
-            17692625804700013551,
-            9992938630604785132,
-            15350127009762647067,
-            10247405821493235386,
-            15172888833500531069,
-            14657693742399622179,
-            7391511805216089127,
-            2035742693690795598,
-            4047216012963057952,
-            12602085105939403203,
-            16985723692990258059,
-            12141021186082151434,
-            3174646196626212833,
-            16484520987666295947,
-            10579720164460442970,
-            9596917135039689219,
-            13761818390665814258,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 64] = Goldilocks::new_array([
-            9158798369861934356,
-            9224859686427886689,
-            16948559910286211274,
-            15765762765140902574,
-            16202509467561200764,
-            1911749439284071529,
-            4607026757869726805,
-            8473827004973131317,
-            13716800466551879373,
-            6670177022201597800,
-            17416833238376299449,
-            14953676562252669578,
-            5828107070718286209,
-            17980287408679531241,
-            2220583438808757820,
-            14564318040622847100,
-            3950519594558514416,
-            12164610170526828198,
-            457385640833960098,
-            14068973922383216628,
-            9614382247226943793,
-            3932756878771319222,
-            12728498054939249570,
-            9435109056498897661,
-            7283114805836756402,
-            1720178259138435097,
-            11496602000538177285,
-            7736206812858942065,
-            14289784438950643645,
-            12052665489155550962,
-            12918409840610303255,
-            5224324424989208352,
-            7826309014606327907,
-            11657314889847733528,
-            13899641072303006348,
-            7501780959676548477,
-            1064261716045449147,
-            1487682458939665452,
-            10894217148983862136,
-            12785338167343566981,
-            8043323074629160032,
-            10852328074701301213,
-            15029722608724150267,
-            2611937278660861263,
-            13995790409949796943,
-            7103138700054564899,
-            12756778219044204581,
-            4147399997707606088,
-            11930966590061754579,
-            16708700985380478903,
-            2370160521342035603,
-            14893791582608133454,
-            15313288276425450946,
-            16224601303711716386,
-            4488931442519177087,
-            7443169181907410918,
-            12381442753785370161,
-            16366345507676500076,
-            8097905256807642731,
-            8504207502183388457,
-            11400931328719780407,
-            10879211614969476303,
-            7265889003783205111,
-            7322738272300165489,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-
-    #[test]
-    fn goldilocks68() {
-        let input: [Goldilocks; 68] = Goldilocks::new_array([
-            16450563043143968653,
-            3688080826640678185,
-            133253417037384537,
-            17501558583799613353,
-            14920674569425704293,
-            5030578721963251055,
-            9795600398273758687,
-            402012644192671817,
-            10657312189068414445,
-            9508835336085746575,
-            16081669758721272608,
-            2072823794278273547,
-            16831381326702573736,
-            11381683312293543190,
-            5679539322738625588,
-            9346499485038639332,
-            15554202803455984983,
-            18373955571490331663,
-            11323895584334729789,
-            16834542679468148445,
-            14751528164286075953,
-            3755158780970327991,
-            12622814707645103582,
-            10329238611694882547,
-            7642766530280843057,
-            4876120096290984742,
-            412912224820604426,
-            9118233770240274553,
-            3626520971021993076,
-            10841049054903806738,
-            18205546599950141835,
-            7198482606375262809,
-            17183313930831625294,
-            10181033256431249241,
-            1061211413812819905,
-            3980261141891682525,
-            5674176959446948353,
-            6062696542969845681,
-            3383081006315025715,
-            8812665902421024067,
-            3093645099818246186,
-            16178737149039707082,
-            8204245222345541411,
-            11072582337937050490,
-            17969785901925882398,
-            4670890092981706609,
-            12537558683977529426,
-            12084598516323376868,
-            16293685096019175644,
-            10117612240421467846,
-            17873102395739074620,
-            11220493906741851877,
-            4632957003022201019,
-            12934229307704669322,
-            2152792796882257594,
-            12521131928134126701,
-            17472006670677761650,
-            4560570065837283016,
-            6315543803073912887,
-            4098689719955359793,
-            1784883877365258237,
-            6837590090927294950,
-            2391417016765166652,
-            16389291664603960875,
-            12285946887702044436,
-            7231705445010258971,
-            12976071926225281356,
-            8829402645443096358,
-        ]);
-
-        let output = MdsMatrixGoldilocks.permute(input);
-
-        let expected: [Goldilocks; 68] = Goldilocks::new_array([
-            4984914285749049383,
-            10397959071664799177,
-            3331616814639908945,
-            4252459885611162121,
-            5517786723806029201,
-            1826620401370703815,
-            8257849352373689773,
-            1722805960790112693,
-            17654983138917187833,
-            7542660006721409612,
-            1970182718241277021,
-            12865815507550811641,
-            17507096607056552658,
-            7988714902687660369,
-            150082662759625574,
-            17329095993317360383,
-            965880604543562997,
-            2820931239306841741,
-            1980667983336380501,
-            3781794112174728826,
-            7323192150179872391,
-            12243426826276589932,
-            315076483410634889,
-            3221894784246078707,
-            3515955216509190252,
-            964376148920419876,
-            7679719864273407732,
-            2516714701741920303,
-            4837221266652621366,
-            15301563603415983061,
-            10380321314559647625,
-            3023678426639670063,
-            12020917879204725519,
-            10595808165609787680,
-            14199186729378048831,
-            4520610719509879248,
-            9983949546821718635,
-            5066092593424854949,
-            13843503196305181790,
-            14296362815835302652,
-            6766348697864530153,
-            13804582129741554661,
-            8032169955336281598,
-            5198513488794721460,
-            10613667919514788349,
-            7948289550930596506,
-            14118391408956101449,
-            4356952068887595371,
-            709878153008378134,
-            17168579964784489802,
-            17840495726541494819,
-            2710471020841761312,
-            9950159372116756450,
-            3909574932971200058,
-            2430964021804554670,
-            6035162446515244642,
-            14656543530572478095,
-            1539013407173403800,
-            4150113154618904744,
-            4904646199269229662,
-            17257014030727492672,
-            3791823431764085889,
-            13680668409434600948,
-            12367427987617118934,
-            12462908457168650050,
-            10891613749697412017,
-            6867760775372053830,
-            12474954319307005079,
-        ]);
-
-        assert_eq!(output, expected);
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs
deleted file mode 100644
index 89da79e45..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs
+++ /dev/null
@@ -1,1143 +0,0 @@
-//! Poseidon1 permutation for Goldilocks.
-//!
-//! # Overview
-//!
-//! This module provides the Poseidon1 hash permutation instantiated for the
-//! Goldilocks field (p = 2^64 - 2^32 + 1). The public API is a single type
-//! alias that transparently dispatches to the best available implementation.
-//!
-//! # Platform Dispatch
-//!
-//! On **aarch64**, the type alias resolves to a dual-dispatch wrapper:
-//! scalar permutations use NEON-accelerated MDS for full rounds with
-//! LLVM-optimized sparse partial rounds, while packed NEON permutations
-//! use the fused dual-lane ASM path (w8) or per-lane scalar path (w12).
-//!
-//! On **all other platforms**, it resolves to the generic Poseidon1
-//! implementation with Karatsuba MDS convolution.
-//!
-//! No `#[cfg]` is needed in calling code.
-//!
-//! # MDS Matrix
-//!
-//! The MDS matrix is a **circulant** matrix sourced from the MDS crate.
-//! At runtime, it is applied via fast Karatsuba convolution (sub-O(t^2)).
-//! During initialization only, it is expanded to dense form for the
-//! sparse matrix decomposition of partial rounds.
-//!
-//! # Round Constants
-//!
-//! Generated by the Grain LFSR (Poseidon1 paper, Appendix E) with SBOX=0 (x^alpha encoding).
-
-use p3_poseidon1::{
-    Poseidon1, Poseidon1Constants, Poseidon1ExternalLayerGeneric, Poseidon1InternalLayerGeneric,
-};
-
-use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL};
-use crate::{Goldilocks, MdsMatrixGoldilocks};
-
-/// S-box degree for Goldilocks Poseidon1.
-///
-/// The S-box raises each element to this power. The Goldilocks prime
-/// factors as `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. Neither 3 nor 5
-/// are coprime to `p - 1`, so the smallest valid exponent is 7.
-pub const GOLDILOCKS_S_BOX_DEGREE: u64 = 7;
-
-/// Number of full rounds per half for Goldilocks Poseidon (`RF / 2`).
-///
-/// The total number of full rounds is `RF = 8` (4 beginning + 4 ending).
-/// Follows the Poseidon paper's security analysis (Section 5.4) with a +2 RF margin.
-pub const GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS: usize = 4;
-
-/// Number of partial rounds for Goldilocks Poseidon (width 8).
-///
-/// Derived from the interpolation bound in the Poseidon paper (Eq. 3):
-///
-///   R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5
-///            = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20
-///
-/// With the +7.5% security margin (Section 5.4): ⌈1.075 × 20⌉ = 22.
-pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8: usize = 22;
-
-/// Number of partial rounds for Goldilocks Poseidon (width 12).
-///
-/// Same interpolation bound as width 8:
-///
-///   R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20
-///
-/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
-pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12: usize = 22;
-
-/// Generic (non-fused) Poseidon1 permutation for Goldilocks.
-///
-/// Uses the platform-independent Poseidon1 implementation with Karatsuba
-/// MDS convolution. Used directly for widths not supported by the fused
-/// type (e.g. 16, 24) and as the non-aarch64 fallback for widths 8 and 12.
-pub type Poseidon1GoldilocksGeneric<const WIDTH: usize> = Poseidon1<
-    Goldilocks,
-    Poseidon1ExternalLayerGeneric<Goldilocks, MdsMatrixGoldilocks, WIDTH>,
-    Poseidon1InternalLayerGeneric<Goldilocks, WIDTH>,
-    WIDTH,
-    GOLDILOCKS_S_BOX_DEGREE,
->;
-
-/// Unified Poseidon1 permutation for Goldilocks.
-///
-/// On aarch64, resolves to a dual-dispatch wrapper: scalar permutations
-/// use NEON MDS for full rounds with sparse partial rounds, packed NEON
-/// permutations use fused dual-lane ASM (w8) or per-lane scalar (w12).
-///
-/// On all other platforms, resolves to the generic implementation with
-/// Karatsuba MDS convolution.
-///
-/// Supports both scalar and packed state representations transparently.
-#[cfg(target_arch = "aarch64")]
-pub type Poseidon1Goldilocks<const WIDTH: usize> = crate::Poseidon1GoldilocksDispatch<WIDTH>;
-
-/// Unified Poseidon1 permutation for Goldilocks.
-///
-/// On aarch64, resolves to the fused ASM-optimized implementation that
-/// uses inline assembly and dual-lane NEON processing.
-///
-/// On all other platforms, resolves to the generic implementation with
-/// Karatsuba MDS convolution.
-///
-/// Supports both scalar and packed state representations transparently.
-#[cfg(not(target_arch = "aarch64"))]
-pub type Poseidon1Goldilocks<const WIDTH: usize> = Poseidon1GoldilocksGeneric<WIDTH>;
-
-/// Round constants for width-8 Poseidon1 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
-///
-/// Generated by `poseidon/generate_constants.py --field goldilocks --width 8`.
-///
-/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)].
-pub const GOLDILOCKS_POSEIDON1_RC_8: [[Goldilocks; 8]; 30] = Goldilocks::new_2d_array([
-    // Initial full rounds (4)
-    [
-        0xdd5743e7f2a5a5d9,
-        0xcb3a864e58ada44b,
-        0xffa2449ed32f8cdc,
-        0x42025f65d6bd13ee,
-        0x7889175e25506323,
-        0x34b98bb03d24b737,
-        0xbdcc535ecc4faa2a,
-        0x5b20ad869fc0d033,
-    ],
-    [
-        0xf1dda5b9259dfcb4,
-        0x27515210be112d59,
-        0x4227d1718c766c3f,
-        0x26d333161a5bd794,
-        0x49b938957bf4b026,
-        0x4a56b5938b213669,
-        0x1120426b48c8353d,
-        0x6b323c3f10a56cad,
-    ],
-    [
-        0xce57d6245ddca6b2,
-        0xb1fc8d402bba1eb1,
-        0xb5c5096ca959bd04,
-        0x6db55cd306d31f7f,
-        0xc49d293a81cb9641,
-        0x1ce55a4fe979719f,
-        0xa92e60a9d178a4d1,
-        0x002cc64973bcfd8c,
-    ],
-    [
-        0xcea721cce82fb11b,
-        0xe5b55eb8098ece81,
-        0x4e30525c6f1ddd66,
-        0x43c6702827070987,
-        0xaca68430a7b5762a,
-        0x3674238634df9c93,
-        0x88cee1c825e33433,
-        0xde99ae8d74b57176,
-    ],
-    // Partial rounds (22)
-    [
-        0x488897d85ff51f56,
-        0x1140737ccb162218,
-        0xa7eeb9215866ed35,
-        0x9bd2976fee49fcc9,
-        0xc0c8f0de580a3fcc,
-        0x4fb2dae6ee8fc793,
-        0x343a89f35f37395b,
-        0x223b525a77ca72c8,
-    ],
-    [
-        0x56ccb62574aaa918,
-        0xc4d507d8027af9ed,
-        0xa080673cf0b7e95c,
-        0xf0184884eb70dcf8,
-        0x044f10b0cb3d5c69,
-        0xe9e3f7993938f186,
-        0x1b761c80e772f459,
-        0x606cec607a1b5fac,
-    ],
-    [
-        0x14a0c2e1d45f03cd,
-        0x4eace8855398574f,
-        0xf905ca7103eff3e6,
-        0xf8c8f8d20862c059,
-        0xb524fe8bdd678e5a,
-        0xfbb7865901a1ec41,
-        0x014ef1197d341346,
-        0x9725e20825d07394,
-    ],
-    [
-        0xfdb25aef2c5bae3b,
-        0xbe5402dc598c971e,
-        0x93a5711f04cdca3d,
-        0xc45a9a5b2f8fb97b,
-        0xfe8946a924933545,
-        0x2af997a27369091c,
-        0xaa62c88e0b294011,
-        0x058eb9d810ce9f74,
-    ],
-    [
-        0xb3cb23eced349ae4,
-        0xa3648177a77b4a84,
-        0x43153d905992d95d,
-        0xf4e2a97cda44aa4b,
-        0x5baa2702b908682f,
-        0x082923bdf4f750d1,
-        0x98ae09a325893803,
-        0xf8a6475077968838,
-    ],
-    [
-        0xceb0735bf00b2c5f,
-        0x0a1a5d953888e072,
-        0x2fcb190489f94475,
-        0xb5be06270dec69fc,
-        0x739cb934b09acf8b,
-        0x537750b75ec7f25b,
-        0xe9dd318bae1f3961,
-        0xf7462137299efe1a,
-    ],
-    [
-        0xb1f6b8eee9adb940,
-        0xbdebcc8a809dfe6b,
-        0x40fc1f791b178113,
-        0x3ac1c3362d014864,
-        0x9a016184bdb8aeba,
-        0x95f2394459fbc25e,
-        0xe3f34a07a76a66c2,
-        0x8df25f9ad98b1b96,
-    ],
-    [
-        0x85ffc27171439d9d,
-        0xddcb9a2dcfd26910,
-        0x26b5ba4bf3afb94e,
-        0xffff9cc7c7651e2f,
-        0x8c88364698280b55,
-        0xebc114167b910501,
-        0x2d77b4d89ecfb516,
-        0x332e0828eba151f2,
-    ],
-    [
-        0x46fa6a6450dd4735,
-        0xd00db7dd92384a33,
-        0x5fd4fb751f3a5fc5,
-        0x496fb90c0bb65ea2,
-        0xf3baec0bb87cc5c7,
-        0x862a3c0a7d4c7713,
-        0xbf5f38336a3f47d8,
-        0x41ad9dbc1394a20c,
-    ],
-    [
-        0xcc535945b7dbf0f7,
-        0x82af2bc93685bcec,
-        0x8e4c8d0c8cebfccd,
-        0x17cb39417e84597e,
-        0xd4a965a8c749b232,
-        0xa2cab040f33f3ee5,
-        0xa98811a1fed4e3a6,
-        0x1cc48b54f377e2a1,
-    ],
-    [
-        0xe40cd4f6c5609a27,
-        0x11de79ebca97a4a4,
-        0x9177c73d8b7e929d,
-        0x2a6fe8085797e792,
-        0x3de6e93329f8d5ae,
-        0x3f7af9125da962ff,
-        0xd710682cfc77d3ac,
-        0x48faf05f3b053cf4,
-    ],
-    [
-        0x287db8630da89c8b,
-        0x4d0de32053cb30e9,
-        0x8b37a4f20c5ada7b,
-        0xe7cc6ebe78c84ecf,
-        0x240bdc0a66a2610d,
-        0x8299e7f02caa1650,
-        0x380a53fefb6e754e,
-        0x684a1d8cf8eb6810,
-    ],
-    [
-        0xe839452eb4b8a5e1,
-        0xb03fa62e90626af4,
-        0x11a688602fbc5efc,
-        0x30dda75c355a2d62,
-        0x0f712adcb73810de,
-        0xffdc1102187f1ae1,
-        0x40c34f398254b99c,
-        0xede021b9dc289a4a,
-    ],
-    [
-        0x8b7b05225c4e7dad,
-        0x3bc794346f9d9ff9,
-        0xfccb5a57f2ca86ff,
-        0xbb1502015a7da9d4,
-        0xd7e0a35d4352a015,
-        0x27af7a44f8160931,
-        0xc37442f6782f4615,
-        0xbdf392a9bd095dcb,
-    ],
-    [
-        0xc17f55037cf00de9,
-        0xbcffedd34c71a874,
-        0x5eb45d2a8133d1f2,
-        0xbabe251e1612ebdf,
-        0x3efeb9fbe438c536,
-        0x2d7cef97b4afe1cf,
-        0xe5de1b4660016c0b,
-        0xcdcc26c332f5657c,
-    ],
-    [
-        0xe01dd653daf15809,
-        0xb0a6bdd4b41094b5,
-        0x27eac858b0b03a05,
-        0x51d43b5e93adbdc0,
-        0x8b89a23b0fea5fc9,
-        0xdc8ac3b14f7f2fc1,
-        0xe793f82f1efec039,
-        0x9f6f2cf8969e7b80,
-    ],
-    [
-        0x49d45382e0f21d4a,
-        0x5f4ad1797cd72786,
-        0x4dc3dbebfd45f795,
-        0x03a3ef84dba6e1bc,
-        0x204bc9b3d3fc4c01,
-        0x9ad706081e89b9ba,
-        0x638bfb4d840e9f89,
-        0x5ef2938cd095ae35,
-    ],
-    [
-        0x42cca18ebeb265c8,
-        0xb7b2ec5c29aecbf8,
-        0x0d84f9535dc78f0f,
-        0x04e64ad942e77b8c,
-        0xb4880dffffc9da0b,
-        0x16db16d9c29adeb1,
-        0x09bbaf2a0590cd1e,
-        0x76460e74961fcf8d,
-    ],
-    [
-        0xed12a2276dfa1553,
-        0x0b5acec5de0436fd,
-        0x3c6cfea033a1f0a8,
-        0x2b5ecefe546cac15,
-        0x6e2d82884cd3bf6f,
-        0xc134878d1add7b83,
-        0x997963422eb7a280,
-        0x5e834537ac648cf6,
-    ],
-    [
-        0x89e779214737c0b7,
-        0x1a8c05e8581ad95b,
-        0x8d18b72796437cf7,
-        0xe7252c949e04b106,
-        0x53267c4fd174585a,
-        0xa16ef5d9c81dad47,
-        0xda65191937270a46,
-        0xcb2a5b55f2df664c,
-    ],
-    [
-        0x854aee2dc1924137,
-        0xf37013c9d479ece6,
-        0x0e163bc0630c4696,
-        0x384ee64955048f76,
-        0xf65d814e28ee4ec5,
-        0xe57bc564fd82f1b1,
-        0x4b338937b6876614,
-        0x66ee0b04ed43cd8d,
-    ],
-    [
-        0x49884bf25f4ef15d,
-        0xeb51fe28de1c6f54,
-        0x2cd64e84fce8dfcc,
-        0x29164a96a541a013,
-        0x173ce7558f4cacb8,
-        0xeb5b1ce5877c89e9,
-        0x5faff4b0f5217bf6,
-        0xac42d0b1c20f205e,
-    ],
-    // Terminal full rounds (4)
-    [
-        0xfb1d6bf0ca43221b,
-        0x97b0a1b01d6a2955,
-        0x08c60bd622952b30,
-        0x43f2be0f9e24147c,
-        0xfa7268b7d3730f5d,
-        0x43a6c419a23983bb,
-        0xcd77c1f7b29b113c,
-        0xcfa43c9db8eec29f,
-    ],
-    [
-        0xcaaa95a6c7365dec,
-        0x0a91193f798f3be0,
-        0x1104497652735dc6,
-        0x35aecb93663b515e,
-        0x8dbc9916065aa858,
-        0xada8f7a0266579ed,
-        0x524dee7bec1ea789,
-        0xa93aee9dd5af9521,
-    ],
-    [
-        0x9d1f1b54750d707e,
-        0x7c9feab87096d5dc,
-        0xa2e1fb19f9d4261b,
-        0xb714deb448de6346,
-        0x225d1f0d011c5403,
-        0x1549b7f1d28cedc0,
-        0xaef3e46f97d43942,
-        0x6dfc7ffe0b38bf08,
-    ],
-    [
-        0x7de853fdc542b663,
-        0xa68ecc96610657b2,
-        0xe88bb5428af289b1,
-        0xd7cfa1504c5569f5,
-        0x78a9aad0d642d30a,
-        0xd68315f2353dce52,
-        0x46e56300f86fcfd5,
-        0x323d95332b145fd6,
-    ],
-]);
-
-/// Round constants for width-12 Poseidon1 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
-///
-/// Generated by `poseidon/generate_constants.py --field goldilocks --width 12`.
-///
-/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)].
-pub const GOLDILOCKS_POSEIDON1_RC_12: [[Goldilocks; 12]; 30] = Goldilocks::new_2d_array([
-    // Initial full rounds (4)
-    [
-        0x13dcf33aba214f46,
-        0x30b3b654a1da6d83,
-        0x1fc634ada6159b56,
-        0x937459964dc03466,
-        0xedd2ef2ca7949924,
-        0xede9affde0e22f68,
-        0x8515b9d6bac9282d,
-        0x6b5c07b4e9e900d8,
-        0x1ec66368838c8a08,
-        0x9042367d80d1fbab,
-        0x400283564a3c3799,
-        0x4a00be0466bca75e,
-    ],
-    [
-        0x7913beee58e3817f,
-        0xf545e88532237d90,
-        0x22f8cb8736042005,
-        0x6f04990e247a2623,
-        0xfe22e87ba37c38cd,
-        0xd20e32c85ffe2815,
-        0x117227674048fe73,
-        0x4e9fb7ea98a6b145,
-        0xe0866c232b8af08b,
-        0x00bbc77916884964,
-        0x7031c0fb990d7116,
-        0x240a9e87cf35108f,
-    ],
-    [
-        0x2e6363a5a12244b3,
-        0x5e1c3787d1b5011c,
-        0x4132660e2a196e8b,
-        0x3a013b648d3d4327,
-        0xf79839f49888ea43,
-        0xfe85658ebafe1439,
-        0xb6889825a14240bd,
-        0x578453605541382b,
-        0x4508cda8f6b63ce9,
-        0x9c3ef35848684c91,
-        0x0812bde23c87178c,
-        0xfe49638f7f722c14,
-    ],
-    [
-        0x8e3f688ce885cbf5,
-        0xb8e110acf746a87d,
-        0xb4b2e8973a6dabef,
-        0x9e714c5da3d462ec,
-        0x6438f9033d3d0c15,
-        0x24312f7cf1a27199,
-        0x23f843bb47acbf71,
-        0x9183f11a34be9f01,
-        0x839062fbb9d45dbf,
-        0x24b56e7e6c2e43fa,
-        0xe1683da61c962a72,
-        0xa95c63971a19bfa7,
-    ],
-    // Partial rounds (22)
-    [
-        0x4adf842aa75d4316,
-        0xf8fbb871aa4ab4eb,
-        0x68e85b6eb2dd6aeb,
-        0x07a0b06b2d270380,
-        0xd94e0228bd282de4,
-        0x8bdd91d3250c5278,
-        0x209c68b88bba778f,
-        0xb5e18cdab77f3877,
-        0xb296a3e808da93fa,
-        0x8370ecbda11a327e,
-        0x3f9075283775dad8,
-        0xb78095bb23c6aa84,
-    ],
-    [
-        0x3f36b9fe72ad4e5f,
-        0x69bc96780b10b553,
-        0x3f1d341f2eb7b881,
-        0x4e939e9815838818,
-        0xda366b3ae2a31604,
-        0xbc89db1e7287d509,
-        0x6102f411f9ef5659,
-        0x58725c5e7ac1f0ab,
-        0x0df5856c798883e7,
-        0xf7bb62a8da4c961b,
-        0xc68be7c94882a24d,
-        0xaf996d5d5cdaedd9,
-    ],
-    [
-        0x9717f025e7daf6a5,
-        0x6436679e6e7216f4,
-        0x8a223d99047af267,
-        0xbb512e35a133ba9a,
-        0xfbbf44097671aa03,
-        0xf04058ebf6811e61,
-        0x5cca84703fac7ffb,
-        0x9b55c7945de6469f,
-        0x8e05bf09808e934f,
-        0x2ea900de876307d7,
-        0x7748fff2b38dfb89,
-        0x6b99a676dd3b5d81,
-    ],
-    [
-        0xac4bb7c627cf7c13,
-        0xadb6ebe5e9e2f5ba,
-        0x2d33378cafa24ae3,
-        0x1e5b73807543f8c2,
-        0x09208814bfebb10f,
-        0x782e64b6bb5b93dd,
-        0xadd5a48eac90b50f,
-        0xadd4c54c736ea4b1,
-        0xd58dbb86ed817fd8,
-        0x6d5ed1a533f34ddd,
-        0x28686aa3e36b7cb9,
-        0x591abd3476689f36,
-    ],
-    [
-        0x047d766678f13875,
-        0xa2a11112625f5b49,
-        0x21fd10a3f8304958,
-        0xf9b40711443b0280,
-        0xd2697eb8b2bde88e,
-        0x3493790b51731b3f,
-        0x11caf9dd73764023,
-        0x7acfb8f72878164e,
-        0x744ec4db23cefc26,
-        0x1e00e58f422c6340,
-        0x21dd28d906a62dda,
-        0xf32a46ab5f465b5f,
-    ],
-    [
-        0xbfce13201f3f7e6b,
-        0xf30d2e7adb5304e2,
-        0xecdf4ee4abad48e9,
-        0xf94e82182d395019,
-        0x4ee52e3744d887c5,
-        0xa1341c7cac0083b2,
-        0x2302fb26c30c834a,
-        0xaea3c587273bf7d3,
-        0xf798e24961823ec7,
-        0x962deba3e9a2cd94,
-        0xb36ee79485ca4707,
-        0xd380199eddd2de52,
-    ],
-    [
-        0x70971fc4e6f85305,
-        0x8e722f6e5dc32699,
-        0xa0883df133052b92,
-        0x8f86c6a3eb7d01a4,
-        0x763649c8b670bdc5,
-        0x830d5c82b808759b,
-        0xaa1da8bb91da02e7,
-        0x9bc9bf629e211c4d,
-        0x0f0a899b10a4dea8,
-        0xb883bdcee7c6b356,
-        0x78c7101e7496ae1e,
-        0x2fd6c5a8bf1e5ca6,
-    ],
-    [
-        0xe2a6e06e61fcec9c,
-        0xebfce7d5c5b3dbd5,
-        0xca2eeca4bb485d85,
-        0xc2b875537c42eb69,
-        0x6faf849976873328,
-        0xfc3fcb6e81ad4cc3,
-        0x180dd95503955a28,
-        0xd40f19a3c9fe1520,
-        0x49d178ddbf7fd96d,
-        0x3950bee2e10e0297,
-        0x437b90cf295be062,
-        0xa5cd126edffad23b,
-    ],
-    [
-        0xdf58134c134491c2,
-        0x0677eca229d9f7bd,
-        0x492200a1f7d83a3c,
-        0xafb58c9810a43645,
-        0x7659077c5a9c208e,
-        0x30b4bc83706995cd,
-        0xc98fa77bbbef3a3b,
-        0x84a82905750b3109,
-        0x72f2a02326aeb69b,
-        0x8d27a2a2d73a848a,
-        0xaa9e30a80bde4b68,
-        0x63abb1415e050474,
-    ],
-    [
-        0x1c4bd1e816050a7e,
-        0x15d1502e4f469dfd,
-        0x53989d594b0c4cd8,
-        0x7a1a4c83cb7e377e,
-        0x1b52f8a9944e480e,
-        0xeb7b03f76a91a79e,
-        0x0073a4fc9328c69e,
-        0x2c7b16f8620d9de4,
-        0x950d052963e46bc4,
-        0x8d201ba1a9c89fac,
-        0xd3502941bdf35503,
-        0x7c6dfcd5af8676fb,
-    ],
-    [
-        0xf8a6cd02e92cdb0b,
-        0x6e7500f3a5464b22,
-        0x07637eabba4bdd20,
-        0x88b82717beee0e14,
-        0xbaa2b1cd3dd4c79a,
-        0xdfecc3aebec4cfa6,
-        0x7561087b0cff0166,
-        0x538fcac317a703a6,
-        0xd7d6c6eeeeeeea19,
-        0xd647b1ee441658a0,
-        0xdf4442110236c546,
-        0x559ef2c6dd73ec15,
-    ],
-    [
-        0x4c0f5fc6c0dda3d1,
-        0x685010cc3100cea7,
-        0x2fb6ba8aa0344440,
-        0xb515f0a3ca75f1fb,
-        0x886887eaecb87c10,
-        0xf03ec3fd710abb04,
-        0xd3b4763e17f543ef,
-        0x50d9e5716e78083a,
-        0x0bce2385cf8d74ff,
-        0xaf23032cd5f0e04b,
-        0xd366aa112b6159d9,
-        0x810a3ad3ac7979db,
-    ],
-    [
-        0x0a4a11d794be40a2,
-        0xeebf0cf23b668a3f,
-        0x600873fb011d761b,
-        0x0bfb5591a02ff618,
-        0xa16e2a528910af52,
-        0xf6553653e2878421,
-        0xccbe7c7a601a30c0,
-        0xb18b214fe489f5b3,
-        0xe21017ab9e153425,
-        0x586099ede17af9a6,
-        0x385078b514f50647,
-        0xc02b3a9afb89883d,
-    ],
-    [
-        0x6d3fbd3b4a9f1de6,
-        0x4b4d40a41b0f473c,
-        0x838f1887b8f31711,
-        0x9396895be5c58a41,
-        0x6247a479d66fc2e3,
-        0x13fe228a98f2d0a2,
-        0x5ba5fde765f9481e,
-        0xafb89fa62267e117,
-        0xfa4dc1bebcaa6333,
-        0xdbab590882b87289,
-        0xc3b6c08e23ba9301,
-        0xd84b5de94a324fb7,
-    ],
-    [
-        0x0d0c371c5b35b850,
-        0x7964f570e7188038,
-        0x5daf18bbd996604c,
-        0x6743bc47b9595258,
-        0x5528b9362c59bb71,
-        0xac45e25b7127b68c,
-        0xa2077d7dfbb606b6,
-        0xf3faac6faee378af,
-        0x0c6388b51545e884,
-        0xd27dbb6944917b61,
-        0x89bcac584344c104,
-        0x856bab802ce7402d,
-    ],
-    [
-        0x2cff3000be1fcd0a,
-        0x765f2977fa72a917,
-        0x1443711329f5f9d5,
-        0xd35cd0261af2f951,
-        0x2a1bb986084ec281,
-        0x2334a54b758f23f2,
-        0xa9b8cb612caf706b,
-        0xb6ba11c4ab1a1017,
-        0xde96b0824b4b46e2,
-        0xc59d4272c6d92e2c,
-        0x389bb5107611754d,
-        0x23647fbc77657372,
-    ],
-    [
-        0xd5ef60d6f76a42fa,
-        0xebb406bb79ac9819,
-        0x55faccc709a2f423,
-        0xd9d6ea97490091cd,
-        0xef3ce5069647a7e4,
-        0xdf31625d3fa78464,
-        0x242e60fd68f10f66,
-        0x39c966cc815f084d,
-        0x20e2e22e02bae3f7,
-        0xb38919d3f1173d7c,
-        0xf17769f6c77084d9,
-        0xcc051d8094cac41f,
-    ],
-    [
-        0x942069f5d6eece7e,
-        0x8d61d3e6f141c572,
-        0xc5cef9d85dd605f4,
-        0x938f2ac2bf885997,
-        0x23bddbace7c48f6c,
-        0xc90a6c5ba98537e4,
-        0x0be6ee2cca90f6ae,
-        0xa026175394ae0e90,
-        0x29fca3e314c77628,
-        0x2aa2aa8738ab7b77,
-        0xe11bbd31fbb8cac6,
-        0xb5bbbef1b78a23af,
-    ],
-    [
-        0x8b62a5551e9a9797,
-        0x3f91073d4d491c80,
-        0x4cfa44976396424a,
-        0xf8dcb2dfb3aa1b44,
-        0x3849409eba1a95f5,
-        0x070845799f234380,
-        0x184c0093667da1ba,
-        0xbd66aafccd51601e,
-        0xee6d14e92155b490,
-        0x626f2ec1865bc544,
-        0x1bd2854bf6485986,
-        0x368b8497472f12ef,
-    ],
-    [
-        0x4f88cdcdfb791921,
-        0xe2c0acfeda9ae781,
-        0x9739bc21773469b3,
-        0x00ce3ad64dc4bb8f,
-        0xaab85a321ee7a4c8,
-        0xd5de825be97004f4,
-        0x48d676d3a043b1c6,
-        0x9c6180b1ff643097,
-        0x34882a89dd590b09,
-        0xae7e6b0d249c3b1d,
-        0x8c016908a04885a1,
-        0x83ebaaebc9ae0721,
-    ],
-    [
-        0xab21b42e0f642307,
-        0xdb46631f62bb29c1,
-        0xef29f0399e09b5d9,
-        0x5b52fbb3613b8ba1,
-        0x57e129fcc96922e6,
-        0xcdeb14c9d9204b3a,
-        0x1341ef0da8536e34,
-        0xd7e3400f2bacde63,
-        0x6911eeb42f70d7e5,
-        0xc3a2a910a4679767,
-        0x1773cbe4a0f6bb28,
-        0xe17b0d53e843eab5,
-    ],
-    [
-        0x587fa39990b62800,
-        0x0d5d32788135879d,
-        0x277f7b31fd3a4cdb,
-        0xa435290ee56d7efa,
-        0xea6f40be35159925,
-        0xcb73377a506171cb,
-        0xe43c367ce731d82a,
-        0x6eb305031ca10c43,
-        0xc019a8c622cc84cb,
-        0xd5614f5658c612e6,
-        0x7b1ecbe957c3ff98,
-        0x60db6ee9651a8478,
-    ],
-    // Terminal full rounds (4)
-    [
-        0x9271d450fc9b4117,
-        0xcffeea06b6e3aac1,
-        0xfa4a44c748d1cd8e,
-        0xe64db01ba569b469,
-        0xd31005160e4045fe,
-        0x39e0fa013e025f79,
-        0xe243be574196a956,
-        0x205b2a681e3d2642,
-        0x79cae5ad93486bab,
-        0xfdf567844e32c295,
-        0x331679589bfb7189,
-        0xaf06ee32297b89c2,
-    ],
-    [
-        0xa6bcae311e498491,
-        0x9d16f52c96ac8b3e,
-        0x48a674b59393fa35,
-        0x0f9e65da3fde3796,
-        0x1e098310fc84578c,
-        0x559ae5fab1ae8dad,
-        0x56bd4d624078881d,
-        0xfd8bbbf8fbe817b5,
-        0x82d30695c44df534,
-        0x3ec0a97bc41127c5,
-        0x1eb8b64adaa22078,
-        0x82c45e418d60c983,
-    ],
-    [
-        0xb092280f484d55bf,
-        0xcd317c9537697939,
-        0xd3be2e352feb79f3,
-        0xca6d866539a390e5,
-        0xb5efb1a494e55ee6,
-        0xfa9013ac89756e9e,
-        0xaeb88efd1e981242,
-        0x13ee477cdab6e0dc,
-        0xce7df902c40da2d3,
-        0xf3fbaf0d4e6f5f34,
-        0xf96354ada6785f38,
-        0x13b5692812406886,
-    ],
-    [
-        0xf03cae030a0f4418,
-        0x7d3172887aa98e1a,
-        0x8a2c2644f2faf7b9,
-        0x80d721abee696d00,
-        0x27c8b903a4d68267,
-        0xaf0b7b12f90291b8,
-        0x00acd08cfdff3817,
-        0x4659ee496c634328,
-        0xf5b25c10730dbff1,
-        0xdde3a153297329c2,
-        0x50c0b70d6910a44b,
-        0x23c7426af725a6a0,
-    ],
-]);
-
-/// Create the default width-8 Poseidon1 permutation for Goldilocks.
-///
-/// Returns the platform-optimal implementation: dual-dispatch on aarch64
-/// (generic for scalar, fused ASM for packed), generic Karatsuba on all
-/// other platforms.
-#[cfg(target_arch = "aarch64")]
-pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> {
-    let constants = Poseidon1Constants {
-        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-        mds_circ_col: MATRIX_CIRC_MDS_8_COL,
-        round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(),
-    };
-    let (full, partial) = constants.to_optimized();
-    let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial);
-    crate::Poseidon1GoldilocksDispatch::new(fused, full, partial)
-}
-
-/// Create the default width-8 Poseidon1 permutation for Goldilocks.
-///
-/// Returns the platform-optimal implementation: fused ASM on aarch64,
-/// generic Karatsuba on all other platforms.
-#[cfg(not(target_arch = "aarch64"))]
-pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> {
-    Poseidon1::new(&Poseidon1Constants {
-        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-        mds_circ_col: MATRIX_CIRC_MDS_8_COL,
-        round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(),
-    })
-}
-
-/// Create the default width-12 Poseidon1 permutation for Goldilocks.
-///
-/// Returns the platform-optimal implementation: dual-dispatch on aarch64
-/// (generic for scalar, fused ASM for packed), generic Karatsuba on all
-/// other platforms.
-#[cfg(target_arch = "aarch64")]
-pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> {
-    let constants = Poseidon1Constants {
-        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12,
-        mds_circ_col: MATRIX_CIRC_MDS_12_COL,
-        round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(),
-    };
-    let (full, partial) = constants.to_optimized();
-    let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial);
-    crate::Poseidon1GoldilocksDispatch::new(fused, full, partial)
-}
-
-/// Create the default width-12 Poseidon1 permutation for Goldilocks.
-///
-/// Returns the platform-optimal implementation: fused ASM on aarch64,
-/// generic Karatsuba on all other platforms.
-#[cfg(not(target_arch = "aarch64"))]
-pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> {
-    Poseidon1::new(&Poseidon1Constants {
-        rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-        rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12,
-        mds_circ_col: MATRIX_CIRC_MDS_12_COL,
-        round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(),
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_symmetric::Permutation;
-    use rand::SeedableRng;
-    use rand::rngs::SmallRng;
-
-    use super::*;
-
-    type F = Goldilocks;
-
-    /// Known-answer test for width 8 (sequential 0..7 input).
-    #[test]
-    fn test_poseidon_goldilocks_width_8() {
-        let perm = default_goldilocks_poseidon1_8();
-
-        let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
-        perm.permute_mut(&mut input);
-
-        let expected: [F; 8] = F::new_array([
-            2431226948502761687,
-            9427563026145807618,
-            6827549936272051660,
-            16907684411084503785,
-            10131745626715172913,
-            17448305483431576765,
-            9066501914269485014,
-            12095238468458521303,
-        ]);
-        assert_eq!(input, expected);
-    }
-
-    /// Known-answer test for width 12 (sequential 0..11 input).
-    #[test]
-    fn test_poseidon_goldilocks_width_12() {
-        let perm = default_goldilocks_poseidon1_12();
-
-        let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
-        perm.permute_mut(&mut input);
-
-        let expected: [F; 12] = F::new_array([
-            15595088881848875364,
-            9564850329150784619,
-            13607005230761744521,
-            12117102595842533385,
-            2814257411756993122,
-            11640647689983397089,
-            14363867760831937423,
-            13323891071259596526,
-            11219803511311150468,
-            9221595262780869902,
-            5898229059046891887,
-            18181291031484020550,
-        ]);
-        assert_eq!(input, expected);
-    }
-
-    /// Smoke test for width 16 with random constants.
-    /// Uses the generic type directly since the fused type only supports 8 and 12.
-    #[test]
-    fn test_poseidon_goldilocks_width_16() {
-        let mut rng = SmallRng::seed_from_u64(1);
-        let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng(
-            GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-            GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-            &MdsMatrixGoldilocks,
-            &mut rng,
-        );
-        let input: [F; 16] = rand::RngExt::random(&mut rng);
-        let output = poseidon.permute(input);
-        assert_ne!(output, input);
-    }
-
-    /// Smoke test for width 24 with random constants.
-    #[test]
-    fn test_poseidon_goldilocks_width_24() {
-        let mut rng = SmallRng::seed_from_u64(1);
-        let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng(
-            GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-            GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-            &MdsMatrixGoldilocks,
-            &mut rng,
-        );
-        let input: [F; 24] = rand::RngExt::random(&mut rng);
-        let output = poseidon.permute(input);
-        assert_ne!(output, input);
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
-    mod avx512 {
-        use super::*;
-        use crate::PackedGoldilocksAVX512;
-
-        #[test]
-        fn test_avx512_poseidon_width_16() {
-            let mut rng = SmallRng::seed_from_u64(1);
-            let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng(
-                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-                &MdsMatrixGoldilocks,
-                &mut rng,
-            );
-            let input: [F; 16] = rand::RngExt::random(&mut rng);
-
-            let mut expected = input;
-            poseidon.permute_mut(&mut expected);
-
-            let mut avx512_input = input.map(Into::<PackedGoldilocksAVX512>::into);
-            poseidon.permute_mut(&mut avx512_input);
-
-            let avx512_output = avx512_input.map(|x| x.0[0]);
-            assert_eq!(avx512_output, expected);
-        }
-
-        #[test]
-        fn test_avx512_poseidon_width_24() {
-            let mut rng = SmallRng::seed_from_u64(1);
-            let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng(
-                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-                &MdsMatrixGoldilocks,
-                &mut rng,
-            );
-            let input: [F; 24] = rand::RngExt::random(&mut rng);
-
-            let mut expected = input;
-            poseidon.permute_mut(&mut expected);
-
-            let mut avx512_input = input.map(Into::<PackedGoldilocksAVX512>::into);
-            poseidon.permute_mut(&mut avx512_input);
-
-            let avx512_output = avx512_input.map(|x| x.0[0]);
-            assert_eq!(avx512_output, expected);
-        }
-    }
-
-    #[cfg(all(
-        target_arch = "x86_64",
-        target_feature = "avx2",
-        not(target_feature = "avx512f")
-    ))]
-    mod avx2 {
-        use super::*;
-        use crate::PackedGoldilocksAVX2;
-
-        #[test]
-        fn test_avx2_poseidon_width_16() {
-            let mut rng = SmallRng::seed_from_u64(1);
-            let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng(
-                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-                &MdsMatrixGoldilocks,
-                &mut rng,
-            );
-            let input: [F; 16] = rand::RngExt::random(&mut rng);
-
-            let mut expected = input;
-            poseidon.permute_mut(&mut expected);
-
-            let mut avx2_input = input.map(Into::<PackedGoldilocksAVX2>::into);
-            poseidon.permute_mut(&mut avx2_input);
-
-            let avx2_output = avx2_input.map(|x| x.0[0]);
-            assert_eq!(avx2_output, expected);
-        }
-
-        #[test]
-        fn test_avx2_poseidon_width_24() {
-            let mut rng = SmallRng::seed_from_u64(1);
-            let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng(
-                GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS,
-                GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8,
-                &MdsMatrixGoldilocks,
-                &mut rng,
-            );
-            let input: [F; 24] = rand::RngExt::random(&mut rng);
-
-            let mut expected = input;
-            poseidon.permute_mut(&mut expected);
-
-            let mut avx2_input = input.map(Into::<PackedGoldilocksAVX2>::into);
-            poseidon.permute_mut(&mut avx2_input);
-
-            let avx2_output = avx2_input.map(|x| x.0[0]);
-            assert_eq!(avx2_output, expected);
-        }
-    }
-
-    #[cfg(target_arch = "aarch64")]
-    mod neon {
-        use super::*;
-        use crate::PackedGoldilocksNeon;
-
-        #[test]
-        fn test_neon_poseidon_width_8() {
-            let perm = default_goldilocks_poseidon1_8();
-            let input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
-
-            let mut expected = input;
-            perm.permute_mut(&mut expected);
-
-            let mut neon_input = input.map(Into::<PackedGoldilocksNeon>::into);
-            perm.permute_mut(&mut neon_input);
-
-            let neon_output = neon_input.map(|x| x.0[0]);
-            assert_eq!(neon_output, expected);
-        }
-
-        #[test]
-        fn test_neon_poseidon_width_12() {
-            let perm = default_goldilocks_poseidon1_12();
-            let input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
-
-            let mut expected = input;
-            perm.permute_mut(&mut expected);
-
-            let mut neon_input = input.map(Into::<PackedGoldilocksNeon>::into);
-            perm.permute_mut(&mut neon_input);
-
-            let neon_output = neon_input.map(|x| x.0[0]);
-            assert_eq!(neon_output, expected);
-        }
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs
deleted file mode 100644
index b5d158610..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs
+++ /dev/null
@@ -1,980 +0,0 @@
-//! Implementation of Poseidon2, see: https://eprint.iacr.org/2023/323
-
-use alloc::vec::Vec;
-
-use p3_field::{Algebra, InjectiveMonomial, PrimeCharacteristicRing};
-#[cfg(not(target_arch = "aarch64"))]
-use p3_poseidon2::Poseidon2;
-use p3_poseidon2::{
-    ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, GenericPoseidon2LinearLayers,
-    InternalLayer, InternalLayerConstructor, MDSMat4, add_rc_and_sbox_generic,
-    external_initial_permute_state, external_terminal_permute_state, internal_permute_state,
-    matmul_internal,
-};
-
-use crate::Goldilocks;
-use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE;
-
-/// Number of full rounds per half for Goldilocks Poseidon2 (`RF / 2`).
-///
-/// The total number of full rounds is `RF = 8` (4 beginning + 4 ending).
-/// Follows the Poseidon2 paper's security analysis with a +2 RF margin.
-pub const GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS: usize = 4;
-
-/// Number of partial rounds for Goldilocks Poseidon2 (width 8).
-///
-/// Derived from the interpolation bound in the Poseidon paper (Eq. 3):
-///
-///   R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5
-///            = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20
-///
-/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
-pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8: usize = 22;
-
-/// Number of partial rounds for Goldilocks Poseidon2 (width 12).
-///
-/// Same interpolation bound as width 8:
-///
-///   R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20
-///
-/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22.
-pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_12: usize = 22;
-
-/// An implementation of the Poseidon2 hash function for the Goldilocks field.
-///
-/// It acts on arrays of the form `[Goldilocks; WIDTH]`.
-#[cfg(target_arch = "aarch64")]
-pub type Poseidon2Goldilocks<const WIDTH: usize> = crate::Poseidon2GoldilocksFused<WIDTH>;
-
-/// An implementation of the Poseidon2 hash function for the Goldilocks field.
-///
-/// It acts on arrays of the form `[Goldilocks; WIDTH]`.
-#[cfg(not(target_arch = "aarch64"))]
-pub type Poseidon2Goldilocks<const WIDTH: usize> = Poseidon2<
-    Goldilocks,
-    Poseidon2ExternalLayerGoldilocks<WIDTH>,
-    Poseidon2InternalLayerGoldilocks,
-    WIDTH,
-    GOLDILOCKS_S_BOX_DEGREE,
->;
-
-/// Round constants for width-8 Poseidon2 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
-///
-/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
-///
-/// Layout: external_initial (4 rounds × 8 elements).
-pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL: [[Goldilocks; 8]; 4] = [
-    Goldilocks::new_array([
-        0xdd5743e7f2a5a5d9,
-        0xcb3a864e58ada44b,
-        0xffa2449ed32f8cdc,
-        0x42025f65d6bd13ee,
-        0x7889175e25506323,
-        0x34b98bb03d24b737,
-        0xbdcc535ecc4faa2a,
-        0x5b20ad869fc0d033,
-    ]),
-    Goldilocks::new_array([
-        0xf1dda5b9259dfcb4,
-        0x27515210be112d59,
-        0x4227d1718c766c3f,
-        0x26d333161a5bd794,
-        0x49b938957bf4b026,
-        0x4a56b5938b213669,
-        0x1120426b48c8353d,
-        0x6b323c3f10a56cad,
-    ]),
-    Goldilocks::new_array([
-        0xce57d6245ddca6b2,
-        0xb1fc8d402bba1eb1,
-        0xb5c5096ca959bd04,
-        0x6db55cd306d31f7f,
-        0xc49d293a81cb9641,
-        0x1ce55a4fe979719f,
-        0xa92e60a9d178a4d1,
-        0x002cc64973bcfd8c,
-    ]),
-    Goldilocks::new_array([
-        0xcea721cce82fb11b,
-        0xe5b55eb8098ece81,
-        0x4e30525c6f1ddd66,
-        0x43c6702827070987,
-        0xaca68430a7b5762a,
-        0x3674238634df9c93,
-        0x88cee1c825e33433,
-        0xde99ae8d74b57176,
-    ]),
-];
-
-/// Round constants for width-8 Poseidon2 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
-///
-/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
-///
-/// Layout: external_final (4 rounds × 8 elements).
-pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL: [[Goldilocks; 8]; 4] = [
-    Goldilocks::new_array([
-        0x014ef1197d341346,
-        0x9725e20825d07394,
-        0xfdb25aef2c5bae3b,
-        0xbe5402dc598c971e,
-        0x93a5711f04cdca3d,
-        0xc45a9a5b2f8fb97b,
-        0xfe8946a924933545,
-        0x2af997a27369091c,
-    ]),
-    Goldilocks::new_array([
-        0xaa62c88e0b294011,
-        0x058eb9d810ce9f74,
-        0xb3cb23eced349ae4,
-        0xa3648177a77b4a84,
-        0x43153d905992d95d,
-        0xf4e2a97cda44aa4b,
-        0x5baa2702b908682f,
-        0x082923bdf4f750d1,
-    ]),
-    Goldilocks::new_array([
-        0x98ae09a325893803,
-        0xf8a6475077968838,
-        0xceb0735bf00b2c5f,
-        0x0a1a5d953888e072,
-        0x2fcb190489f94475,
-        0xb5be06270dec69fc,
-        0x739cb934b09acf8b,
-        0x537750b75ec7f25b,
-    ]),
-    Goldilocks::new_array([
-        0xe9dd318bae1f3961,
-        0xf7462137299efe1a,
-        0xb1f6b8eee9adb940,
-        0xbdebcc8a809dfe6b,
-        0x40fc1f791b178113,
-        0x3ac1c3362d014864,
-        0x9a016184bdb8aeba,
-        0x95f2394459fbc25e,
-    ]),
-];
-
-/// Round constants for width-8 Poseidon2 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22
-///
-/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`.
-///
-/// Layout: internal (22 scalar constants).
-pub const GOLDILOCKS_POSEIDON2_RC_8_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([
-    0x488897d85ff51f56,
-    0x1140737ccb162218,
-    0xa7eeb9215866ed35,
-    0x9bd2976fee49fcc9,
-    0xc0c8f0de580a3fcc,
-    0x4fb2dae6ee8fc793,
-    0x343a89f35f37395b,
-    0x223b525a77ca72c8,
-    0x56ccb62574aaa918,
-    0xc4d507d8027af9ed,
-    0xa080673cf0b7e95c,
-    0xf0184884eb70dcf8,
-    0x044f10b0cb3d5c69,
-    0xe9e3f7993938f186,
-    0x1b761c80e772f459,
-    0x606cec607a1b5fac,
-    0x14a0c2e1d45f03cd,
-    0x4eace8855398574f,
-    0xf905ca7103eff3e6,
-    0xf8c8f8d20862c059,
-    0xb524fe8bdd678e5a,
-    0xfbb7865901a1ec41,
-]);
-
-/// Round constants for width-12 Poseidon2 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
-///
-/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
-///
-/// Layout: external_initial (4 rounds × 12 elements).
-pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL: [[Goldilocks; 12]; 4] = [
-    Goldilocks::new_array([
-        0x13dcf33aba214f46,
-        0x30b3b654a1da6d83,
-        0x1fc634ada6159b56,
-        0x937459964dc03466,
-        0xedd2ef2ca7949924,
-        0xede9affde0e22f68,
-        0x8515b9d6bac9282d,
-        0x6b5c07b4e9e900d8,
-        0x1ec66368838c8a08,
-        0x9042367d80d1fbab,
-        0x400283564a3c3799,
-        0x4a00be0466bca75e,
-    ]),
-    Goldilocks::new_array([
-        0x7913beee58e3817f,
-        0xf545e88532237d90,
-        0x22f8cb8736042005,
-        0x6f04990e247a2623,
-        0xfe22e87ba37c38cd,
-        0xd20e32c85ffe2815,
-        0x117227674048fe73,
-        0x4e9fb7ea98a6b145,
-        0xe0866c232b8af08b,
-        0x00bbc77916884964,
-        0x7031c0fb990d7116,
-        0x240a9e87cf35108f,
-    ]),
-    Goldilocks::new_array([
-        0x2e6363a5a12244b3,
-        0x5e1c3787d1b5011c,
-        0x4132660e2a196e8b,
-        0x3a013b648d3d4327,
-        0xf79839f49888ea43,
-        0xfe85658ebafe1439,
-        0xb6889825a14240bd,
-        0x578453605541382b,
-        0x4508cda8f6b63ce9,
-        0x9c3ef35848684c91,
-        0x0812bde23c87178c,
-        0xfe49638f7f722c14,
-    ]),
-    Goldilocks::new_array([
-        0x8e3f688ce885cbf5,
-        0xb8e110acf746a87d,
-        0xb4b2e8973a6dabef,
-        0x9e714c5da3d462ec,
-        0x6438f9033d3d0c15,
-        0x24312f7cf1a27199,
-        0x23f843bb47acbf71,
-        0x9183f11a34be9f01,
-        0x839062fbb9d45dbf,
-        0x24b56e7e6c2e43fa,
-        0xe1683da61c962a72,
-        0xa95c63971a19bfa7,
-    ]),
-];
-
-/// Round constants for width-12 Poseidon2 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
-///
-/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
-///
-/// Layout: external_final (4 rounds × 12 elements).
-pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL: [[Goldilocks; 12]; 4] = [
-    Goldilocks::new_array([
-        0xc68be7c94882a24d,
-        0xaf996d5d5cdaedd9,
-        0x9717f025e7daf6a5,
-        0x6436679e6e7216f4,
-        0x8a223d99047af267,
-        0xbb512e35a133ba9a,
-        0xfbbf44097671aa03,
-        0xf04058ebf6811e61,
-        0x5cca84703fac7ffb,
-        0x9b55c7945de6469f,
-        0x8e05bf09808e934f,
-        0x2ea900de876307d7,
-    ]),
-    Goldilocks::new_array([
-        0x7748fff2b38dfb89,
-        0x6b99a676dd3b5d81,
-        0xac4bb7c627cf7c13,
-        0xadb6ebe5e9e2f5ba,
-        0x2d33378cafa24ae3,
-        0x1e5b73807543f8c2,
-        0x09208814bfebb10f,
-        0x782e64b6bb5b93dd,
-        0xadd5a48eac90b50f,
-        0xadd4c54c736ea4b1,
-        0xd58dbb86ed817fd8,
-        0x6d5ed1a533f34ddd,
-    ]),
-    Goldilocks::new_array([
-        0x28686aa3e36b7cb9,
-        0x591abd3476689f36,
-        0x047d766678f13875,
-        0xa2a11112625f5b49,
-        0x21fd10a3f8304958,
-        0xf9b40711443b0280,
-        0xd2697eb8b2bde88e,
-        0x3493790b51731b3f,
-        0x11caf9dd73764023,
-        0x7acfb8f72878164e,
-        0x744ec4db23cefc26,
-        0x1e00e58f422c6340,
-    ]),
-    Goldilocks::new_array([
-        0x21dd28d906a62dda,
-        0xf32a46ab5f465b5f,
-        0xbfce13201f3f7e6b,
-        0xf30d2e7adb5304e2,
-        0xecdf4ee4abad48e9,
-        0xf94e82182d395019,
-        0x4ee52e3744d887c5,
-        0xa1341c7cac0083b2,
-        0x2302fb26c30c834a,
-        0xaea3c587273bf7d3,
-        0xf798e24961823ec7,
-        0x962deba3e9a2cd94,
-    ]),
-];
-
-/// Round constants for width-12 Poseidon2 on Goldilocks.
-///
-/// Generated by the Grain LFSR with parameters:
-///     field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22
-///
-/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`.
-///
-/// Layout: internal (22 scalar constants).
-pub const GOLDILOCKS_POSEIDON2_RC_12_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([
-    0x4adf842aa75d4316,
-    0xf8fbb871aa4ab4eb,
-    0x68e85b6eb2dd6aeb,
-    0x07a0b06b2d270380,
-    0xd94e0228bd282de4,
-    0x8bdd91d3250c5278,
-    0x209c68b88bba778f,
-    0xb5e18cdab77f3877,
-    0xb296a3e808da93fa,
-    0x8370ecbda11a327e,
-    0x3f9075283775dad8,
-    0xb78095bb23c6aa84,
-    0x3f36b9fe72ad4e5f,
-    0x69bc96780b10b553,
-    0x3f1d341f2eb7b881,
-    0x4e939e9815838818,
-    0xda366b3ae2a31604,
-    0xbc89db1e7287d509,
-    0x6102f411f9ef5659,
-    0x58725c5e7ac1f0ab,
-    0x0df5856c798883e7,
-    0xf7bb62a8da4c961b,
-]);
-
-/// Create a default width-8 Poseidon2 permutation for Goldilocks.
-#[cfg(not(target_arch = "aarch64"))]
-pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> {
-    Poseidon2::new(
-        ExternalLayerConstants::new(
-            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(),
-            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(),
-        ),
-        GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec(),
-    )
-}
-
-/// Create a default width-8 Poseidon2 permutation for Goldilocks.
-#[cfg(target_arch = "aarch64")]
-pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> {
-    crate::Poseidon2GoldilocksFused::new(
-        &ExternalLayerConstants::new(
-            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(),
-            GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(),
-        ),
-        &GOLDILOCKS_POSEIDON2_RC_8_INTERNAL,
-    )
-}
-
-/// Create a default width-12 Poseidon2 permutation for Goldilocks.
-#[cfg(not(target_arch = "aarch64"))]
-pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> {
-    Poseidon2::new(
-        ExternalLayerConstants::new(
-            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(),
-            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(),
-        ),
-        GOLDILOCKS_POSEIDON2_RC_12_INTERNAL.to_vec(),
-    )
-}
-
-/// Create a default width-12 Poseidon2 permutation for Goldilocks.
-#[cfg(target_arch = "aarch64")]
-pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> {
-    crate::Poseidon2GoldilocksFused::new(
-        &ExternalLayerConstants::new(
-            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(),
-            GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(),
-        ),
-        &GOLDILOCKS_POSEIDON2_RC_12_INTERNAL,
-    )
-}
-
-pub const MATRIX_DIAG_8_GOLDILOCKS: [Goldilocks; 8] = Goldilocks::new_array([
-    0xfffffffeffffffff, // -2
-    0x0000000000000001, // 1
-    0x0000000000000002, // 2
-    0x7fffffff80000001, // 1/2
-    0x0000000000000003, // 3
-    0x7fffffff80000000, // -1/2
-    0xfffffffefffffffe, // -3
-    0xfffffffefffffffd, // -4
-]);
-
-pub const MATRIX_DIAG_12_GOLDILOCKS: [Goldilocks; 12] = Goldilocks::new_array([
-    0xfffffffeffffffff, // -2
-    0x0000000000000001, // 1
-    0x0000000000000002, // 2
-    0x7fffffff80000001, // 1/2
-    0x0000000000000003, // 3
-    0x0000000000000004, // 4
-    0x7fffffff80000000, // -1/2
-    0xfffffffefffffffe, // -3
-    0xfffffffefffffffd, // -4
-    0xbfffffff40000001, // 1/2^2
-    0x3fffffffc0000000, // -1/2^2
-    0xdfffffff20000001, // 1/2^3
-]);
-
-pub const MATRIX_DIAG_16_GOLDILOCKS: [Goldilocks; 16] = Goldilocks::new_array([
-    0xfffffffeffffffff, // -2
-    0x0000000000000001, // 1
-    0x0000000000000002, // 2
-    0x7fffffff80000001, // 1/2
-    0x0000000000000003, // 3
-    0x0000000000000004, // 4
-    0x7fffffff80000000, // -1/2
-    0xfffffffefffffffe, // -3
-    0xfffffffefffffffd, // -4
-    0xdfffffff20000001, // 1/2^3
-    0xefffffff10000001, // 1/2^4
-    0xf7ffffff08000001, // 1/2^5
-    0x1fffffffe0000000, // -1/2^3
-    0x0ffffffff0000000, // -1/2^4
-    0x07fffffff8000000, // -1/2^5
-    0xfffffffe00000002, // 1/2^32
-]);
-
-pub const MATRIX_DIAG_20_GOLDILOCKS: [Goldilocks; 20] = Goldilocks::new_array([
-    0x95c381fda3b1fa57,
-    0xf36fe9eb1288f42c,
-    0x89f5dcdfef277944,
-    0x106f22eadeb3e2d2,
-    0x684e31a2530e5111,
-    0x27435c5d89fd148e,
-    0x3ebed31c414dbf17,
-    0xfd45b0b2d294e3cc,
-    0x48c904473a7f6dbf,
-    0xe0d1b67809295b4d,
-    0xddd1941e9d199dcb,
-    0x8cfe534eeb742219,
-    0xa6e5261d9e3b8524,
-    0x6897ee5ed0f82c1b,
-    0x0e7dcd0739ee5f78,
-    0x493253f3d0d32363,
-    0xbb2737f5845f05c0,
-    0xa187e810b06ad903,
-    0xb635b995936c4918,
-    0x0b3694a940bd2394,
-]);
-
-fn internal_layer_mat_mul_goldilocks_8<A: Algebra<Goldilocks>>(state: &mut [A; 8]) {
-    let sum: A = state.iter().map(|r| r.dup()).sum();
-
-    let s0 = state[0].dup();
-    let s1 = state[1].dup();
-    let s2 = state[2].dup();
-    let s3 = state[3].dup();
-    let s4 = state[4].dup();
-    let s5 = state[5].dup();
-    let s6 = state[6].dup();
-    let s7 = state[7].dup();
-
-    // V[0] = -2
-    let two_s0 = s0.dup() + s0;
-    state[0] = sum.dup() - two_s0;
-
-    // V[1] = 1
-    state[1] = sum.dup() + s1;
-
-    // V[2] = 2
-    let two_s2 = s2.dup() + s2;
-    state[2] = sum.dup() + two_s2;
-
-    // V[3] = 1/2
-    state[3] = sum.dup() + s3.halve();
-
-    // V[4] = 3
-    let two_s4 = s4.dup() + s4.dup();
-    let three_s4 = two_s4 + s4;
-    state[4] = sum.dup() + three_s4;
-
-    // V[5] = -1/2
-    state[5] = sum.dup() - s5.halve();
-
-    // V[6] = -3
-    let two_s6 = s6.dup() + s6.dup();
-    let three_s6 = two_s6 + s6;
-    state[6] = sum.dup() - three_s6;
-
-    // V[7] = -4
-    let two_s7 = s7.dup() + s7;
-    let four_s7 = two_s7.dup() + two_s7;
-    state[7] = sum - four_s7;
-}
-
-fn internal_layer_mat_mul_goldilocks_12<A: Algebra<Goldilocks>>(state: &mut [A; 12]) {
-    let sum: A = state.iter().map(|r| r.dup()).sum();
-
-    let s0 = state[0].dup();
-    let s1 = state[1].dup();
-    let s2 = state[2].dup();
-    let s3 = state[3].dup();
-    let s4 = state[4].dup();
-    let s5 = state[5].dup();
-    let s6 = state[6].dup();
-    let s7 = state[7].dup();
-    let s8 = state[8].dup();
-    let s9 = state[9].dup();
-    let s10 = state[10].dup();
-    let s11 = state[11].dup();
-
-    // V[0] = -2
-    let two_s0 = s0.dup() + s0;
-    state[0] = sum.dup() - two_s0;
-
-    // V[1] = 1
-    state[1] = sum.dup() + s1;
-
-    // V[2] = 2
-    let two_s2 = s2.dup() + s2;
-    state[2] = sum.dup() + two_s2;
-
-    // V[3] = 1/2
-    state[3] = sum.dup() + s3.halve();
-
-    // V[4] = 3
-    let two_s4 = s4.dup() + s4.dup();
-    let three_s4 = two_s4 + s4;
-    state[4] = sum.dup() + three_s4;
-
-    // V[5] = 4
-    let two_s5 = s5.dup() + s5;
-    let four_s5 = two_s5.dup() + two_s5;
-    state[5] = sum.dup() + four_s5;
-
-    // V[6] = -1/2
-    state[6] = sum.dup() - s6.halve();
-
-    // V[7] = -3
-    let two_s7 = s7.dup() + s7.dup();
-    let three_s7 = two_s7 + s7;
-    state[7] = sum.dup() - three_s7;
-
-    // V[8] = -4
-    let two_s8 = s8.dup() + s8;
-    let four_s8 = two_s8.dup() + two_s8;
-    state[8] = sum.dup() - four_s8;
-
-    // V[9] = 1/2^2
-    state[9] = sum.dup() + s9.halve().halve();
-
-    // V[10] = -1/2^2
-    state[10] = sum.dup() - s10.halve().halve();
-
-    // V[11] = 1/2^3
-    state[11] = sum + s11.halve().halve().halve();
-}
-
-fn internal_layer_mat_mul_goldilocks_16<A: Algebra<Goldilocks>>(state: &mut [A; 16]) {
-    let sum: A = state.iter().map(|r| r.dup()).sum();
-
-    let s0 = state[0].dup();
-    let s1 = state[1].dup();
-    let s2 = state[2].dup();
-    let s3 = state[3].dup();
-    let s4 = state[4].dup();
-    let s5 = state[5].dup();
-    let s6 = state[6].dup();
-    let s7 = state[7].dup();
-    let s8 = state[8].dup();
-    let s9 = state[9].dup();
-    let s10 = state[10].dup();
-    let s11 = state[11].dup();
-    let s12 = state[12].dup();
-    let s13 = state[13].dup();
-    let s14 = state[14].dup();
-    let s15 = state[15].dup();
-
-    // V[0] = -2
-    let two_s0 = s0.dup() + s0;
-    state[0] = sum.dup() - two_s0;
-
-    // V[1] = 1
-    state[1] = sum.dup() + s1;
-
-    // V[2] = 2
-    let two_s2 = s2.dup() + s2;
-    state[2] = sum.dup() + two_s2;
-
-    // V[3] = 1/2
-    state[3] = sum.dup() + s3.halve();
-
-    // V[4] = 3
-    let two_s4 = s4.dup() + s4.dup();
-    let three_s4 = two_s4 + s4;
-    state[4] = sum.dup() + three_s4;
-
-    // V[5] = 4
-    let two_s5 = s5.dup() + s5;
-    let four_s5 = two_s5.dup() + two_s5;
-    state[5] = sum.dup() + four_s5;
-
-    // V[6] = -1/2
-    state[6] = sum.dup() - s6.halve();
-
-    // V[7] = -3
-    let two_s7 = s7.dup() + s7.dup();
-    let three_s7 = two_s7 + s7;
-    state[7] = sum.dup() - three_s7;
-
-    // V[8] = -4
-    let two_s8 = s8.dup() + s8;
-    let four_s8 = two_s8.dup() + two_s8;
-    state[8] = sum.dup() - four_s8;
-
-    // V[9] = 1/2^3
-    state[9] = sum.dup() + s9.halve().halve().halve();
-
-    // V[10] = 1/2^4
-    state[10] = sum.dup() + s10.halve().halve().halve().halve();
-
-    // V[11] = 1/2^5
-    state[11] = sum.dup() + s11.halve().halve().halve().halve().halve();
-
-    // V[12] = -1/2^3
-    state[12] = sum.dup() - s12.halve().halve().halve();
-
-    // V[13] = -1/2^4
-    state[13] = sum.dup() - s13.halve().halve().halve().halve();
-
-    // V[14] = -1/2^5
-    state[14] = sum.dup() - s14.halve().halve().halve().halve().halve();
-
-    // V[15] = 1/2^32
-    let inv_2_32 = MATRIX_DIAG_16_GOLDILOCKS[15];
-    let v15 = s15 * inv_2_32;
-    state[15] = sum + v15;
-}
-
-/// The internal layers of the Poseidon2 permutation.
-#[derive(Debug, Clone, Default)]
-pub struct Poseidon2InternalLayerGoldilocks {
-    internal_constants: Vec<Goldilocks>,
-}
-
-impl InternalLayerConstructor<Goldilocks> for Poseidon2InternalLayerGoldilocks {
-    fn new_from_constants(internal_constants: Vec<Goldilocks>) -> Self {
-        Self { internal_constants }
-    }
-}
-
-impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
-    InternalLayer<A, 8, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
-{
-    /// Perform the internal layers of the Poseidon2 permutation on the given state.
-    fn permute_state(&self, state: &mut [A; 8]) {
-        internal_permute_state(
-            state,
-            internal_layer_mat_mul_goldilocks_8,
-            &self.internal_constants,
-        );
-    }
-}
-
-impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
-    InternalLayer<A, 12, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
-{
-    /// Perform the internal layers of the Poseidon2 permutation on the given state.
-    fn permute_state(&self, state: &mut [A; 12]) {
-        internal_permute_state(
-            state,
-            internal_layer_mat_mul_goldilocks_12,
-            &self.internal_constants,
-        );
-    }
-}
-
-impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
-    InternalLayer<A, 16, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
-{
-    /// Perform the internal layers of the Poseidon2 permutation on the given state.
-    fn permute_state(&self, state: &mut [A; 16]) {
-        internal_permute_state(
-            state,
-            internal_layer_mat_mul_goldilocks_16,
-            &self.internal_constants,
-        );
-    }
-}
-
-impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>>
-    InternalLayer<A, 20, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2InternalLayerGoldilocks
-{
-    /// Perform the internal layers of the Poseidon2 permutation on the given state.
-    fn permute_state(&self, state: &mut [A; 20]) {
-        internal_permute_state(
-            state,
-            |x| matmul_internal(x, MATRIX_DIAG_20_GOLDILOCKS),
-            &self.internal_constants,
-        );
-    }
-}
-
-/// The external layers of the Poseidon2 permutation.
-#[derive(Clone)]
-pub struct Poseidon2ExternalLayerGoldilocks<const WIDTH: usize> {
-    pub(crate) external_constants: ExternalLayerConstants<Goldilocks, WIDTH>,
-}
-
-impl<const WIDTH: usize> ExternalLayerConstructor<Goldilocks, WIDTH>
-    for Poseidon2ExternalLayerGoldilocks<WIDTH>
-{
-    fn new_from_constants(external_constants: ExternalLayerConstants<Goldilocks, WIDTH>) -> Self {
-        Self { external_constants }
-    }
-}
-
-impl<A: Algebra<Goldilocks> + InjectiveMonomial<GOLDILOCKS_S_BOX_DEGREE>, const WIDTH: usize>
-    ExternalLayer<A, WIDTH, GOLDILOCKS_S_BOX_DEGREE> for Poseidon2ExternalLayerGoldilocks<WIDTH>
-{
-    /// Perform the initial external layers of the Poseidon2 permutation on the given state.
-    fn permute_state_initial(&self, state: &mut [A; WIDTH]) {
-        external_initial_permute_state(
-            state,
-            self.external_constants.get_initial_constants(),
-            add_rc_and_sbox_generic,
-            &MDSMat4,
-        );
-    }
-
-    /// Perform the terminal external layers of the Poseidon2 permutation on the given state.
-    fn permute_state_terminal(&self, state: &mut [A; WIDTH]) {
-        external_terminal_permute_state(
-            state,
-            self.external_constants.get_terminal_constants(),
-            add_rc_and_sbox_generic,
-            &MDSMat4,
-        );
-    }
-}
-
-/// An implementation of the matrix multiplications in the internal and external layers of Poseidon2.
-///
-/// This can act on `[A; WIDTH]` for any ring implementing `Algebra<Goldilocks>`.
-/// If you have either `[Goldilocks::Packing; WIDTH]` or `[Goldilocks; WIDTH]` it will be much faster
-/// to use `Poseidon2Goldilocks<WIDTH>` instead of building a Poseidon2 permutation using this.
-#[derive(Clone, Debug, Default)]
-pub struct GenericPoseidon2LinearLayersGoldilocks;
-
-impl GenericPoseidon2LinearLayers<8> for GenericPoseidon2LinearLayersGoldilocks {
-    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 8]) {
-        let sum: R = state.iter().map(|r| r.dup()).sum();
-        for i in 0..8 {
-            let d = R::from_u64(MATRIX_DIAG_8_GOLDILOCKS[i].value);
-            state[i] *= d;
-            state[i] += sum.dup();
-        }
-    }
-}
-
-impl GenericPoseidon2LinearLayers<12> for GenericPoseidon2LinearLayersGoldilocks {
-    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 12]) {
-        let sum: R = state.iter().map(|r| r.dup()).sum();
-        for i in 0..12 {
-            let d = R::from_u64(MATRIX_DIAG_12_GOLDILOCKS[i].value);
-            state[i] *= d;
-            state[i] += sum.dup();
-        }
-    }
-}
-
-impl GenericPoseidon2LinearLayers<16> for GenericPoseidon2LinearLayersGoldilocks {
-    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 16]) {
-        let sum: R = state.iter().map(|r| r.dup()).sum();
-        for i in 0..16 {
-            let d = R::from_u64(MATRIX_DIAG_16_GOLDILOCKS[i].value);
-            state[i] *= d;
-            state[i] += sum.dup();
-        }
-    }
-}
-
-impl GenericPoseidon2LinearLayers<20> for GenericPoseidon2LinearLayersGoldilocks {
-    fn internal_linear_layer<R: PrimeCharacteristicRing>(state: &mut [R; 20]) {
-        let sum: R = state.iter().map(|r| r.dup()).sum();
-        for i in 0..20 {
-            let d = R::from_u64(MATRIX_DIAG_20_GOLDILOCKS[i].value);
-            state[i] *= d;
-            state[i] += sum.dup();
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field::PrimeCharacteristicRing;
-    use p3_symmetric::Permutation;
-
-    use super::*;
-
-    type F = Goldilocks;
-
-    #[test]
-    fn test_generic_internal_linear_layer_8_matches_matmul_internal() {
-        let mut state_generic = [
-            F::from_u64(1),
-            F::from_u64(2),
-            F::from_u64(3),
-            F::from_u64(4),
-            F::from_u64(5),
-            F::from_u64(6),
-            F::from_u64(7),
-            F::from_u64(8),
-        ];
-        let mut state_existing = state_generic;
-
-        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
-        matmul_internal(&mut state_existing, MATRIX_DIAG_8_GOLDILOCKS);
-
-        assert_eq!(state_generic, state_existing);
-    }
-
-    #[test]
-    fn test_generic_internal_linear_layer_12_matches_matmul_internal() {
-        let mut state_generic = [
-            F::from_u64(1),
-            F::from_u64(2),
-            F::from_u64(3),
-            F::from_u64(4),
-            F::from_u64(5),
-            F::from_u64(6),
-            F::from_u64(7),
-            F::from_u64(8),
-            F::from_u64(9),
-            F::from_u64(10),
-            F::from_u64(11),
-            F::from_u64(12),
-        ];
-        let mut state_existing = state_generic;
-
-        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
-        matmul_internal(&mut state_existing, MATRIX_DIAG_12_GOLDILOCKS);
-
-        assert_eq!(state_generic, state_existing);
-    }
-
-    #[test]
-    fn test_generic_internal_linear_layer_16_matches_matmul_internal() {
-        let mut state_generic = [
-            F::from_u64(1),
-            F::from_u64(2),
-            F::from_u64(3),
-            F::from_u64(4),
-            F::from_u64(5),
-            F::from_u64(6),
-            F::from_u64(7),
-            F::from_u64(8),
-            F::from_u64(9),
-            F::from_u64(10),
-            F::from_u64(11),
-            F::from_u64(12),
-            F::from_u64(13),
-            F::from_u64(14),
-            F::from_u64(15),
-            F::from_u64(16),
-        ];
-        let mut state_existing = state_generic;
-
-        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
-        matmul_internal(&mut state_existing, MATRIX_DIAG_16_GOLDILOCKS);
-
-        assert_eq!(state_generic, state_existing);
-    }
-
-    #[test]
-    fn test_generic_internal_linear_layer_20_matches_matmul_internal() {
-        let mut state_generic = [
-            F::from_u64(1),
-            F::from_u64(2),
-            F::from_u64(3),
-            F::from_u64(4),
-            F::from_u64(5),
-            F::from_u64(6),
-            F::from_u64(7),
-            F::from_u64(8),
-            F::from_u64(9),
-            F::from_u64(10),
-            F::from_u64(11),
-            F::from_u64(12),
-            F::from_u64(13),
-            F::from_u64(14),
-            F::from_u64(15),
-            F::from_u64(16),
-            F::from_u64(17),
-            F::from_u64(18),
-            F::from_u64(19),
-            F::from_u64(20),
-        ];
-        let mut state_existing = state_generic;
-
-        GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic);
-        matmul_internal(&mut state_existing, MATRIX_DIAG_20_GOLDILOCKS);
-
-        assert_eq!(state_generic, state_existing);
-    }
-
-    #[test]
-    fn test_default_goldilocks_poseidon2_width_8() {
-        let mut input: [F; 8] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7]);
-
-        let expected: [F; 8] = Goldilocks::new_array([
-            0x020cf04a1b214d14,
-            0x84e14aaaeacaed25,
-            0x1ae0f640e81c7457,
-            0xa4d204cbaeb0d8a5,
-            0x0cf637b627b3a7ff,
-            0x788d304d948b486b,
-            0x7327133ea1949af4,
-            0xf415abb924da395b,
-        ]);
-
-        let perm = default_goldilocks_poseidon2_8();
-        perm.permute_mut(&mut input);
-
-        assert_eq!(input, expected);
-    }
-
-    #[test]
-    fn test_default_goldilocks_poseidon2_width_12() {
-        let mut input: [F; 12] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
-
-        let expected: [F; 12] = Goldilocks::new_array([
-            0xf292ab67c0f14b03,
-            0x0a32f1b37656544c,
-            0x053c61ab895498de,
-            0x02ff92e55b196ffb,
-            0x58176e8f6f58cab2,
-            0xb0aa1206e7aec0f8,
-            0xe90c13f3dce83ca4,
-            0xf4da15333edf39c2,
-            0x23b701c053c2ca6c,
-            0xd233d593dcdfbf58,
-            0x4effa5f9516fb52e,
-            0x0aaf4489f1f40166,
-        ]);
-
-        let perm = default_goldilocks_poseidon2_12();
-        perm.permute_mut(&mut input);
-
-        assert_eq!(input, expected);
-    }
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs
deleted file mode 100644
index 44fe4fa3f..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs
+++ /dev/null
@@ -1,86 +0,0 @@
-use p3_mds::MdsPermutation;
-use p3_mds::util::apply_circulant;
-use p3_symmetric::Permutation;
-
-use crate::x86_64_avx2::packing::PackedGoldilocksAVX2;
-use crate::{
-    MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW,
-    MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks,
-};
-const fn convert_array<const N: usize>(arr: [i64; N]) -> [u64; N] {
-    let mut result: [u64; N] = [0; N];
-    let mut i = 0;
-    while i < N {
-        result[i] = arr[i] as u64;
-        i += 1;
-    }
-    result
-}
-
-impl Permutation<[PackedGoldilocksAVX2; 8]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX2; 8]) -> [PackedGoldilocksAVX2; 8] {
-        const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW);
-        apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX2, 8> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksAVX2; 12]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX2; 12]) -> [PackedGoldilocksAVX2; 12] {
-        const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW);
-        apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX2, 12> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksAVX2; 16]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX2; 16]) -> [PackedGoldilocksAVX2; 16] {
-        const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW);
-        apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX2, 16> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksAVX2; 24]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX2; 24]) -> [PackedGoldilocksAVX2; 24] {
-        apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX2, 24> for MdsMatrixGoldilocks {}
-
-#[cfg(test)]
-mod tests {
-    use p3_symmetric::Permutation;
-    use rand::rngs::SmallRng;
-    use rand::{RngExt, SeedableRng};
-
-    use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX2};
-
-    macro_rules! test_avx2_mds {
-        ($name:ident, $width:literal) => {
-            #[test]
-            fn $name() {
-                let mut rng = SmallRng::seed_from_u64(1);
-                let mds = MdsMatrixGoldilocks;
-
-                let input: [Goldilocks; $width] = rng.random();
-                let expected = mds.permute(input);
-
-                let packed_input = input.map(Into::<PackedGoldilocksAVX2>::into);
-                let packed_output = mds.permute(packed_input);
-
-                let avx2_output = packed_output.map(|x| x.0[0]);
-                assert_eq!(avx2_output, expected);
-            }
-        };
-    }
-
-    test_avx2_mds!(test_avx2_mds_width_8, 8);
-    test_avx2_mds!(test_avx2_mds_width_12, 12);
-    test_avx2_mds!(test_avx2_mds_width_16, 16);
-    test_avx2_mds!(test_avx2_mds_width_24, 24);
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs
deleted file mode 100644
index 09300a20f..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-mod mds;
-mod packing;
-pub use packing::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs
deleted file mode 100644
index 217a2b2e0..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs
+++ /dev/null
@@ -1,539 +0,0 @@
-use alloc::vec::Vec;
-use core::arch::x86_64::*;
-use core::fmt::Debug;
-use core::iter::{Product, Sum};
-use core::mem::transmute;
-use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
-
-use p3_field::exponentiation::exp_10540996611094048183;
-use p3_field::interleave::{interleave_u64, interleave_u128};
-use p3_field::op_assign_macros::{
-    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
-    impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
-    ring_sum,
-};
-use p3_field::{
-    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
-    PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2,
-};
-use p3_util::reconstitute_from_base;
-use rand::distr::{Distribution, StandardUniform};
-use rand::{Rng, RngExt};
-
-use crate::{Goldilocks, P};
-
-const WIDTH: usize = 4;
-
-/// Vectorized AVX2 implementation of `Goldilocks` arithmetic.
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
-#[repr(transparent)] // Needed to make `transmute`s safe.
-#[must_use]
-pub struct PackedGoldilocksAVX2(pub [Goldilocks; WIDTH]);
-
-impl PackedGoldilocksAVX2 {
-    /// Get an arch-specific vector representing the packed values.
-    #[inline]
-    #[must_use]
-    pub(crate) fn to_vector(self) -> __m256i {
-        unsafe {
-            // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It
-            // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be
-            // transmuted to `__m256i`, since arrays are guaranteed to be contiguous in memory.
-            // Finally `PackedGoldilocksAVX2` is `repr(transparent)` so it can be transmuted to
-            // `[Goldilocks; WIDTH]`.
-            transmute(self)
-        }
-    }
-
-    /// Make a packed field vector from an arch-specific vector.
-    ///
-    /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function
-    /// is safe unlike the `Mersenne31/MontyField31` variants.
-    #[inline]
-    pub(crate) fn from_vector(vector: __m256i) -> Self {
-        unsafe {
-            // Safety: `__m256i` can be transmuted to `[u64; WIDTH]` (since arrays elements are
-            // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since
-            // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to
-            // `PackedGoldilocksAVX2` (since `PackedGoldilocksAVX2` is also `repr(transparent)`).
-            transmute(vector)
-        }
-    }
-
-    /// Copy `value` to all positions in a packed vector. This is the same as
-    /// `From<Goldilocks>::from`, but `const`.
-    #[inline]
-    const fn broadcast(value: Goldilocks) -> Self {
-        Self([value; WIDTH])
-    }
-}
-
-impl From<Goldilocks> for PackedGoldilocksAVX2 {
-    fn from(x: Goldilocks) -> Self {
-        Self::broadcast(x)
-    }
-}
-
-impl Add for PackedGoldilocksAVX2 {
-    type Output = Self;
-    #[inline]
-    fn add(self, rhs: Self) -> Self {
-        Self::from_vector(add(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl Sub for PackedGoldilocksAVX2 {
-    type Output = Self;
-    #[inline]
-    fn sub(self, rhs: Self) -> Self {
-        Self::from_vector(sub(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl Neg for PackedGoldilocksAVX2 {
-    type Output = Self;
-    #[inline]
-    fn neg(self) -> Self {
-        Self::from_vector(neg(self.to_vector()))
-    }
-}
-
-impl Mul for PackedGoldilocksAVX2 {
-    type Output = Self;
-    #[inline]
-    fn mul(self, rhs: Self) -> Self {
-        Self::from_vector(mul(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl_add_assign!(PackedGoldilocksAVX2);
-impl_sub_assign!(PackedGoldilocksAVX2);
-impl_mul_methods!(PackedGoldilocksAVX2);
-ring_sum!(PackedGoldilocksAVX2);
-impl_rng!(PackedGoldilocksAVX2);
-
-impl PrimeCharacteristicRing for PackedGoldilocksAVX2 {
-    type PrimeSubfield = Goldilocks;
-
-    const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
-    const ONE: Self = Self::broadcast(Goldilocks::ONE);
-    const TWO: Self = Self::broadcast(Goldilocks::TWO);
-    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
-
-    #[inline]
-    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
-        f.into()
-    }
-
-    #[inline]
-    fn halve(&self) -> Self {
-        Self::from_vector(halve(self.to_vector()))
-    }
-
-    #[inline]
-    fn square(&self) -> Self {
-        Self::from_vector(square(self.to_vector()))
-    }
-
-    #[inline]
-    fn zero_vec(len: usize) -> Vec<Self> {
-        // SAFETY: this is a repr(transparent) wrapper around an array.
-        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
-    }
-}
-
-// Degree of the smallest permutation polynomial for Goldilocks.
-//
-// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
-impl InjectiveMonomial<7> for PackedGoldilocksAVX2 {}
-
-impl PermutationMonomial<7> for PackedGoldilocksAVX2 {
-    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
-    ///
-    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
-    fn injective_exp_root_n(&self) -> Self {
-        exp_10540996611094048183(*self)
-    }
-}
-
-impl_add_base_field!(PackedGoldilocksAVX2, Goldilocks);
-impl_sub_base_field!(PackedGoldilocksAVX2, Goldilocks);
-impl_mul_base_field!(PackedGoldilocksAVX2, Goldilocks);
-impl_div_methods!(PackedGoldilocksAVX2, Goldilocks);
-impl_sum_prod_base_field!(PackedGoldilocksAVX2, Goldilocks);
-
-impl Algebra<Goldilocks> for PackedGoldilocksAVX2 {
-    // Benchmarked on AVX2: chunk=32 ≈ 226ns, chunk=2 ≈ 228ns, chunk=16 ≈ 229ns.
-    const BATCHED_LC_CHUNK: usize = 32;
-}
-
-impl_packed_value!(PackedGoldilocksAVX2, Goldilocks, WIDTH);
-
-unsafe impl PackedField for PackedGoldilocksAVX2 {
-    type Scalar = Goldilocks;
-}
-
-impl_packed_field_pow_2!(
-    PackedGoldilocksAVX2;
-    [
-        (1, interleave_u64),
-        (2, interleave_u128),
-    ],
-    WIDTH
-);
-
-// Resources:
-// 1. Intel Intrinsics Guide for explanation of each intrinsic:
-//    https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html
-// 3. Intel optimization manual for introduction to x86 vector extensions and best practices:
-//    https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html
-
-// Preliminary knowledge:
-// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with
-//    _mm256_blendv_epi8 or similar instruction. If all we're doing is conditionally zeroing a
-//    vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper.
-//
-// 2. AVX does not support addition with carry but 128-bit (2-word) addition can be easily
-//    emulated. The method recognizes that for a + b overflowed iff (a + b) < a:
-//        i. res_lo = a_lo + b_lo
-//       ii. carry_mask = res_lo < a_lo
-//      iii. res_hi = a_hi + b_hi - carry_mask
-//    Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions
-//    return -1 (all bits 1) for true and 0 for false.
-//
-// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons
-//    by recognizing that a <u b iff a + (1 << 63) <s b + (1 << 63), where the addition wraps around
-//    and the comparisons are unsigned and signed respectively. The shift function adds/subtracts
-//    1 << 63 to enable this trick.
-//      Example: addition with carry.
-//        i. a_lo_s = shift(a_lo)
-//       ii. res_lo_s = a_lo_s + b_lo
-//      iii. carry_mask = res_lo_s <s a_lo_s
-//       iv. res_lo = shift(res_lo_s)
-//        v. res_hi = a_hi + b_hi - carry_mask
-//    The suffix _s denotes a value that has been shifted by 1 << 63. The result of addition is
-//    shifted if exactly one of the operands is shifted, as is the case on line ii. Line iii.
-//    performs a signed comparison res_lo_s <s a_lo_s on shifted values to emulate unsigned
-//    comparison res_lo <u a_lo on unshifted values. Finally, line iv. reverses the shift so the
-//    result can be returned.
-//      When performing a chain of calculations, we can often save instructions by letting the shift
-//    propagate through and only undoing it when necessary. For example, to compute the addition of
-//    three two-word (128-bit) numbers we can do:
-//        i. a_lo_s = shift(a_lo)
-//       ii. tmp_lo_s = a_lo_s + b_lo
-//      iii. tmp_carry_mask = tmp_lo_s <s a_lo_s
-//       iv. tmp_hi = a_hi + b_hi - tmp_carry_mask
-//        v. res_lo_s = tmp_lo_s + c_lo
-//       vi. res_carry_mask = res_lo_s <s tmp_lo_s
-//      vii. res_lo = shift(res_lo_s)
-//     viii. res_hi = tmp_hi + c_hi - res_carry_mask
-//    Notice that the above 3-value addition still only requires two calls to shift, just like our
-//    2-value addition.
-
-const SIGN_BIT: __m256i = unsafe { transmute([i64::MIN; WIDTH]) };
-const SHIFTED_FIELD_ORDER: __m256i =
-    unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) };
-
-/// Equal to 2^32 - 1 = 2^64 mod P.
-const EPSILON: __m256i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) };
-
-/// Add 2^63 with overflow. Needed to emulate unsigned comparisons (see point 3. in
-/// packed_prime_field.rs).
-#[inline]
-pub fn shift(x: __m256i) -> __m256i {
-    unsafe { _mm256_xor_si256(x, SIGN_BIT) }
-}
-
-/// Convert to canonical representation.
-/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + 1<<63, where x is the field
-///   value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1<<63),
-///   where 0 <= y < FIELD_ORDER).
-#[inline]
-unsafe fn canonicalize_s(x_s: __m256i) -> __m256i {
-    unsafe {
-        // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1.
-        let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s);
-        // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0.
-        let wrapback_amt = _mm256_andnot_si256(mask, EPSILON);
-        _mm256_add_epi64(x_s, wrapback_amt)
-    }
-}
-
-/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is
-/// pre-shifted by 1 << 63. The result is similarly shifted.
-#[inline]
-unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i {
-    unsafe {
-        let res_wrapped_s = _mm256_add_epi64(x, y_s);
-        let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0.
-        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0.
-        _mm256_add_epi64(res_wrapped_s, wrapback_amt)
-    }
-}
-
-/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER`.
-///
-/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn add(x: __m256i, y: __m256i) -> __m256i {
-    unsafe {
-        let y_s = shift(y);
-        let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s));
-        shift(res_s)
-    }
-}
-
-/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER`.
-///
-/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn sub(x: __m256i, y: __m256i) -> __m256i {
-    unsafe {
-        let mut y_s = shift(y);
-        y_s = canonicalize_s(y_s);
-        let x_s = shift(x);
-        let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0.
-        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0.
-        let res_wrapped = _mm256_sub_epi64(x_s, y_s);
-        _mm256_sub_epi64(res_wrapped, wrapback_amt)
-    }
-}
-
-/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER`.
-///
-/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn neg(y: __m256i) -> __m256i {
-    unsafe {
-        let y_s = shift(y);
-        _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s))
-    }
-}
-
-/// Halve a vector of Goldilocks field elements.
-#[inline(always)]
-pub(crate) fn halve(input: __m256i) -> __m256i {
-    /*
-        We want this to compile to:
-            vpand    least_bit, val, ONE
-            vpsrlq   t, val, 1
-            vpsubq   neg_least_bit, ZERO, least_bit
-            vpand    maybe_half, HALF, neg_least_bit
-            vpaddq   res, t, maybe_half
-        throughput: 1.67 cyc/vec
-        latency: 4 cyc
-
-        Given an element val in [0, P), we want to compute val/2 mod P.
-        If val is even: val/2 mod P = val/2 = val >> 1.
-        If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2
-    */
-    unsafe {
-        // Safety: If this code got compiled then AVX2 intrinsics are available.
-        const ONE: __m256i = unsafe { transmute([1_i64; 4]) };
-        const ZERO: __m256i = unsafe { transmute([0_i64; 4]) };
-        let half = _mm256_set1_epi64x(P.div_ceil(2) as i64); // Compiler should realise this is constant.
-
-        let least_bit = _mm256_and_si256(input, ONE); // Determine the parity of val.
-        let t = _mm256_srli_epi64::<1>(input);
-
-        // Negate the least bit giving us either 0 (all bits 0) or -1 (all bits 1).
-        // It would be better to use vpsignq but this instruction does not exist.
-        let neg_least_bit = _mm256_sub_epi64(ZERO, least_bit);
-
-        // This does nothing when least_bit = 1 and sets the corresponding entry to 0 when least_bit = 0
-        let maybe_half = _mm256_and_si256(half, neg_least_bit);
-        _mm256_add_epi64(t, maybe_half)
-    }
-}
-
-/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the
-/// scalar instruction, but may be worth it if we want our data to live in vector registers.
-#[inline]
-fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
-    unsafe {
-        // We want to move the high 32 bits to the low position. The multiplication instruction ignores
-        // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can
-        // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication.
-        //   This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the
-        // distinction; the casts are free and it guarantees that the exact bit pattern is preserved.
-        // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency
-        // since Haswell.
-        let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
-        let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y)));
-
-        // All four pairwise multiplications
-        let mul_ll = _mm256_mul_epu32(x, y);
-        let mul_lh = _mm256_mul_epu32(x, y_hi);
-        let mul_hl = _mm256_mul_epu32(x_hi, y);
-        let mul_hh = _mm256_mul_epu32(x_hi, y_hi);
-
-        // Bignum addition
-        // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow.
-        let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll);
-        let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi);
-        // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow.
-        // Also, extract high 32 bits of t0 and add to mul_hh.
-        let t0_lo = _mm256_and_si256(t0, EPSILON);
-        let t0_hi = _mm256_srli_epi64::<32>(t0);
-        let t1 = _mm256_add_epi64(mul_lh, t0_lo);
-        let t2 = _mm256_add_epi64(mul_hh, t0_hi);
-        // Lastly, extract the high 32 bits of t1 and add to t2.
-        let t1_hi = _mm256_srli_epi64::<32>(t1);
-        let res_hi = _mm256_add_epi64(t2, t1_hi);
-
-        // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high
-        // position).
-        let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1)));
-        let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo);
-
-        (res_hi, res_lo)
-    }
-}
-
-/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction.
-#[inline]
-fn square64(x: __m256i) -> (__m256i, __m256i) {
-    unsafe {
-        // Get high 32 bits of x. See comment in mul64_64_s.
-        let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
-
-        // All pairwise multiplications.
-        let mul_ll = _mm256_mul_epu32(x, x);
-        let mul_lh = _mm256_mul_epu32(x, x_hi);
-        let mul_hh = _mm256_mul_epu32(x_hi, x_hi);
-
-        // Bignum addition, but mul_lh is shifted by 33 bits (not 32).
-        let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll);
-        let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi);
-        let t0_hi = _mm256_srli_epi64::<31>(t0);
-        let res_hi = _mm256_add_epi64(mul_hh, t0_hi);
-
-        // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high
-        // position).
-        let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh);
-        let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo);
-
-        (res_hi, res_lo)
-    }
-}
-
-/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be
-/// `<= 2^64 - 2^32 = 0xffffffff00000000`. The result is shifted by 2**63.
-#[inline]
-unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
-    unsafe {
-        let res_wrapped_s = _mm256_add_epi64(x_s, y);
-        // 32-bit compare is faster than 64-bit. It's safe as long as x > res_wrapped iff x >> 32 >
-        // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case
-        // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 =
-        // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y
-        // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur.
-        let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0.
-        // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise.
-        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0.
-        _mm256_add_epi64(res_wrapped_s, wrapback_amt)
-    }
-}
-
-/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be
-/// <= `0xffffffff00000000`. The result is shifted by 2**63.
-#[inline]
-unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
-    unsafe {
-        let res_wrapped_s = _mm256_sub_epi64(x_s, y);
-        // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >>
-        // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where
-        // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 =
-        // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if
-        // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur.
-        let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0.
-        // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise.
-        let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0.
-        _mm256_sub_epi64(res_wrapped_s, wrapback_amt)
-    }
-}
-
-/// Given a 128-bit value represented as two 64-bit halves, reduce it modulo the Goldilocks field order.
-///
-/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`.
-#[inline]
-fn reduce128(x: (__m256i, __m256i)) -> __m256i {
-    unsafe {
-        let (hi0, lo0) = x;
-
-        // First we shift lo0 to lo0_s = lo0 + 2^{63} mod 2^64
-        // This lets us emulate unsigned comparisons
-        let lo0_s = shift(lo0);
-
-        // Get the top 32 bits of hi_hi0.
-        let hi_hi0 = _mm256_srli_epi64::<32>(hi0);
-
-        // Computes lo0_s - hi_hi0 mod FIELD_ORDER.
-        // Makes sense to do as 2^96 = -1 mod FIELD_ORDER.
-        // sub_small_64s_64_s is safe to use as `hi_hi0 < 2^32`.
-        let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0);
-
-        // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER
-        // _mm256_mul_epu32 ignores the top 32 bits so just use that.
-        let t1 = _mm256_mul_epu32(hi0, EPSILON);
-
-        // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 so we can use `add_small_64s_64_s` to get
-        // `lo2_s = lo1_s + t1 mod FIELD_ORDER.`
-        let lo2_s = add_small_64s_64_s(lo1_s, t1);
-
-        // Finally just need to correct for the shift.
-        shift(lo2_s)
-    }
-}
-
-/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER`.
-///
-/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn mul(x: __m256i, y: __m256i) -> __m256i {
-    reduce128(mul64_64(x, y))
-}
-
-/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER`.
-///
-/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn square(x: __m256i) -> __m256i {
-    reduce128(square64(x))
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field_testing::test_packed_field;
-
-    use super::{Goldilocks, PackedGoldilocksAVX2, WIDTH};
-
-    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([
-        0xFFFF_FFFF_0000_0000,
-        0xFFFF_FFFF_FFFF_FFFF,
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0001,
-    ]);
-
-    const ZEROS: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001,
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001,
-    ]));
-
-    const ONES: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002,
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002,
-    ]));
-
-    test_packed_field!(
-        crate::PackedGoldilocksAVX2,
-        &[super::ZEROS],
-        &[super::ONES],
-        crate::PackedGoldilocksAVX2(super::SPECIAL_VALS)
-    );
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs
deleted file mode 100644
index f4d6c9f71..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs
+++ /dev/null
@@ -1,86 +0,0 @@
-use p3_mds::MdsPermutation;
-use p3_mds::util::apply_circulant;
-use p3_symmetric::Permutation;
-
-use crate::x86_64_avx512::packing::PackedGoldilocksAVX512;
-use crate::{
-    MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW,
-    MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks,
-};
-const fn convert_array<const N: usize>(arr: [i64; N]) -> [u64; N] {
-    let mut result: [u64; N] = [0; N];
-    let mut i = 0;
-    while i < N {
-        result[i] = arr[i] as u64;
-        i += 1;
-    }
-    result
-}
-
-impl Permutation<[PackedGoldilocksAVX512; 8]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX512; 8]) -> [PackedGoldilocksAVX512; 8] {
-        const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW);
-        apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX512, 8> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksAVX512; 12]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX512; 12]) -> [PackedGoldilocksAVX512; 12] {
-        const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW);
-        apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX512, 12> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksAVX512; 16]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX512; 16]) -> [PackedGoldilocksAVX512; 16] {
-        const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW);
-        apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX512, 16> for MdsMatrixGoldilocks {}
-
-impl Permutation<[PackedGoldilocksAVX512; 24]> for MdsMatrixGoldilocks {
-    fn permute(&self, input: [PackedGoldilocksAVX512; 24]) -> [PackedGoldilocksAVX512; 24] {
-        apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input)
-    }
-}
-
-impl MdsPermutation<PackedGoldilocksAVX512, 24> for MdsMatrixGoldilocks {}
-
-#[cfg(test)]
-mod tests {
-    use p3_symmetric::Permutation;
-    use rand::rngs::SmallRng;
-    use rand::{RngExt, SeedableRng};
-
-    use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX512};
-
-    macro_rules! test_avx512_mds {
-        ($name:ident, $width:literal) => {
-            #[test]
-            fn $name() {
-                let mut rng = SmallRng::seed_from_u64(1);
-                let mds = MdsMatrixGoldilocks;
-
-                let input: [Goldilocks; $width] = rng.random();
-                let expected = mds.permute(input);
-
-                let packed_input = input.map(Into::<PackedGoldilocksAVX512>::into);
-                let packed_output = mds.permute(packed_input);
-
-                let avx512_output = packed_output.map(|x| x.0[0]);
-                assert_eq!(avx512_output, expected);
-            }
-        };
-    }
-
-    test_avx512_mds!(test_avx512_mds_width_8, 8);
-    test_avx512_mds!(test_avx512_mds_width_12, 12);
-    test_avx512_mds!(test_avx512_mds_width_16, 16);
-    test_avx512_mds!(test_avx512_mds_width_24, 24);
-}
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs
deleted file mode 100644
index 09300a20f..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-mod mds;
-mod packing;
-pub use packing::*;
diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs
deleted file mode 100644
index 0c751b436..000000000
--- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs
+++ /dev/null
@@ -1,444 +0,0 @@
-use alloc::vec::Vec;
-use core::arch::x86_64::*;
-use core::fmt::Debug;
-use core::iter::{Product, Sum};
-use core::mem::transmute;
-use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
-
-use p3_field::exponentiation::exp_10540996611094048183;
-use p3_field::interleave::{interleave_u64, interleave_u128, interleave_u256};
-use p3_field::op_assign_macros::{
-    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
-    impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
-    ring_sum,
-};
-use p3_field::{
-    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
-    PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2,
-};
-use p3_util::reconstitute_from_base;
-use rand::distr::{Distribution, StandardUniform};
-use rand::{Rng, RngExt};
-
-use crate::{Goldilocks, P};
-
-const WIDTH: usize = 8;
-
-/// Vectorized AVX512 implementation of `Goldilocks` arithmetic.
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
-#[repr(transparent)] // Needed to make `transmute`s safe.
-#[must_use]
-pub struct PackedGoldilocksAVX512(pub [Goldilocks; WIDTH]);
-
-impl PackedGoldilocksAVX512 {
-    /// Get an arch-specific vector representing the packed values.
-    #[inline]
-    #[must_use]
-    pub(crate) fn to_vector(self) -> __m512i {
-        unsafe {
-            // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It
-            // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be
-            // transmuted to `__m512i`, since arrays are guaranteed to be contiguous in memory.
-            // Finally `PackedGoldilocksAVX512` is `repr(transparent)` so it can be transmuted to
-            // `[Goldilocks; WIDTH]`.
-            transmute(self)
-        }
-    }
-
-    /// Make a packed field vector from an arch-specific vector.
-    ///
-    /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function
-    /// is safe unlike the `Mersenne31/MontyField31` variants.
-    #[inline]
-    pub(crate) fn from_vector(vector: __m512i) -> Self {
-        unsafe {
-            // Safety: `__m512i` can be transmuted to `[u64; WIDTH]` (since arrays elements are
-            // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since
-            // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to
-            // `PackedGoldilocksAVX512` (since `PackedGoldilocksAVX512` is also `repr(transparent)`).
-            transmute(vector)
-        }
-    }
-
-    /// Copy `value` to all positions in a packed vector. This is the same as
-    /// `From<Goldilocks>::from`, but `const`.
-    #[inline]
-    const fn broadcast(value: Goldilocks) -> Self {
-        Self([value; WIDTH])
-    }
-}
-
-impl From<Goldilocks> for PackedGoldilocksAVX512 {
-    fn from(x: Goldilocks) -> Self {
-        Self::broadcast(x)
-    }
-}
-
-impl Add for PackedGoldilocksAVX512 {
-    type Output = Self;
-    #[inline]
-    fn add(self, rhs: Self) -> Self {
-        Self::from_vector(add(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl Sub for PackedGoldilocksAVX512 {
-    type Output = Self;
-    #[inline]
-    fn sub(self, rhs: Self) -> Self {
-        Self::from_vector(sub(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl Neg for PackedGoldilocksAVX512 {
-    type Output = Self;
-    #[inline]
-    fn neg(self) -> Self {
-        Self::from_vector(neg(self.to_vector()))
-    }
-}
-
-impl Mul for PackedGoldilocksAVX512 {
-    type Output = Self;
-    #[inline]
-    fn mul(self, rhs: Self) -> Self {
-        Self::from_vector(mul(self.to_vector(), rhs.to_vector()))
-    }
-}
-
-impl_add_assign!(PackedGoldilocksAVX512);
-impl_sub_assign!(PackedGoldilocksAVX512);
-impl_mul_methods!(PackedGoldilocksAVX512);
-ring_sum!(PackedGoldilocksAVX512);
-impl_rng!(PackedGoldilocksAVX512);
-
-impl PrimeCharacteristicRing for PackedGoldilocksAVX512 {
-    type PrimeSubfield = Goldilocks;
-
-    const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
-    const ONE: Self = Self::broadcast(Goldilocks::ONE);
-    const TWO: Self = Self::broadcast(Goldilocks::TWO);
-    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
-
-    #[inline]
-    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
-        f.into()
-    }
-
-    #[inline]
-    fn halve(&self) -> Self {
-        Self::from_vector(halve(self.to_vector()))
-    }
-
-    #[inline]
-    fn square(&self) -> Self {
-        Self::from_vector(square(self.to_vector()))
-    }
-
-    #[inline]
-    fn zero_vec(len: usize) -> Vec<Self> {
-        // SAFETY: this is a repr(transparent) wrapper around an array.
-        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
-    }
-}
-
-impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks);
-impl_sub_base_field!(PackedGoldilocksAVX512, Goldilocks);
-impl_mul_base_field!(PackedGoldilocksAVX512, Goldilocks);
-impl_div_methods!(PackedGoldilocksAVX512, Goldilocks);
-impl_sum_prod_base_field!(PackedGoldilocksAVX512, Goldilocks);
-
-impl Algebra<Goldilocks> for PackedGoldilocksAVX512 {
-    // Benchmarked on AVX-512: chunk=4 ≈ 198ns, chunk=2 ≈ 198ns, chunk=32 ≈ 199ns.
-    const BATCHED_LC_CHUNK: usize = 4;
-}
-
-// Degree of the smallest permutation polynomial for Goldilocks.
-//
-// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
-impl InjectiveMonomial<7> for PackedGoldilocksAVX512 {}
-
-impl PermutationMonomial<7> for PackedGoldilocksAVX512 {
-    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
-    ///
-    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
-    fn injective_exp_root_n(&self) -> Self {
-        exp_10540996611094048183(*self)
-    }
-}
-
-impl_packed_value!(PackedGoldilocksAVX512, Goldilocks, WIDTH);
-
-unsafe impl PackedField for PackedGoldilocksAVX512 {
-    type Scalar = Goldilocks;
-}
-
-impl_packed_field_pow_2!(
-    PackedGoldilocksAVX512;
-    [
-        (1, interleave_u64),
-        (2, interleave_u128),
-        (4, interleave_u256),
-    ],
-    WIDTH
-);
-
-const FIELD_ORDER: __m512i = unsafe { transmute([Goldilocks::ORDER_U64; WIDTH]) };
-const EPSILON: __m512i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) };
-
-#[inline]
-unsafe fn canonicalize(x: __m512i) -> __m512i {
-    unsafe {
-        let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER);
-        _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER)
-    }
-}
-
-/// Compute the modular addition `x + y mod FIELD_ORDER`.
-///
-/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider
-/// set of circumstances if bounds on `x` are known.
-///
-/// The result will be a u64 which may be greater than FIELD_ORDER.
-///
-/// Safety:
-///     User must ensure that x + y < 2^64 + FIELD_ORDER.
-#[inline]
-unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i {
-    unsafe {
-        let res_wrapped = _mm512_add_epi64(x, y);
-        let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed
-        _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER)
-    }
-}
-
-/// Compute the modular subtraction x - y mod FIELD_ORDER.
-///
-/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider
-/// set of circumstances if bounds on `x` are known.
-///
-/// The result will be a u64 which may be greater than FIELD_ORDER.
-///
-/// Safety:
-///     User must ensure that x - y > -FIELD_ORDER.
-#[inline]
-unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i {
-    unsafe {
-        let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y)
-        let res_wrapped = _mm512_sub_epi64(x, y);
-        _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER)
-    }
-}
-
-/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER`.
-///
-/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn add(x: __m512i, y: __m512i) -> __m512i {
-    unsafe { add_no_double_overflow_64_64(x, canonicalize(y)) }
-}
-
-/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER`.
-///
-/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn sub(x: __m512i, y: __m512i) -> __m512i {
-    unsafe { sub_no_double_overflow_64_64(x, canonicalize(y)) }
-}
-
-/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER`.
-///
-/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn neg(y: __m512i) -> __m512i {
-    unsafe { _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) }
-}
-
-/// Halve a vector of Goldilocks field elements.
-#[inline(always)]
-pub(crate) fn halve(input: __m512i) -> __m512i {
-    /*
-        We want this to compile to:
-            vptestmq  least_bit, val, ONE
-            vpsrlq    res, val, 1
-            vpaddq    res{least_bit}, res, maybe_half
-        throughput: 2 cyc/vec
-        latency: 4 cyc
-
-        Given an element val in [0, P), we want to compute val/2 mod P.
-        If val is even: val/2 mod P = val/2 = val >> 1.
-        If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2
-    */
-    unsafe {
-        // Safety: If this code got compiled then AVX512 intrinsics are available.
-        const ONE: __m512i = unsafe { transmute([1_i64; 8]) };
-        let half = _mm512_set1_epi64(P.div_ceil(2) as i64); // Compiler realises this is constant.
-
-        let least_bit = _mm512_test_epi64_mask(input, ONE); // Determine the parity of val.
-        let t = _mm512_srli_epi64::<1>(input);
-        // This does nothing when least_bit = 1 and sets the corresponding entry to 0 when least_bit = 0
-        _mm512_mask_add_epi64(t, least_bit, t, half)
-    }
-}
-
-#[allow(clippy::useless_transmute)]
-const LO_32_BITS_MASK: __mmask16 = unsafe { transmute(0b0101010101010101u16) };
-
-/// Full 64-bit by 64-bit multiplication.
-#[inline]
-fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
-    unsafe {
-        // We want to move the high 32 bits to the low position. The multiplication instruction ignores
-        // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can
-        // be done on port 5; bitshifts run on port 0, competing with multiplication.
-        //   This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the
-        // distinction; the casts are free and it guarantees that the exact bit pattern is preserved.
-        // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency
-        // since Haswell.
-        let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x)));
-        let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y)));
-
-        // All four pairwise multiplications
-        let mul_ll = _mm512_mul_epu32(x, y);
-        let mul_lh = _mm512_mul_epu32(x, y_hi);
-        let mul_hl = _mm512_mul_epu32(x_hi, y);
-        let mul_hh = _mm512_mul_epu32(x_hi, y_hi);
-
-        // Bignum addition
-        // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow.
-        let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll);
-        let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi);
-        // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow.
-        // Also, extract high 32 bits of t0 and add to mul_hh.
-        let t0_lo = _mm512_and_si512(t0, EPSILON);
-        let t0_hi = _mm512_srli_epi64::<32>(t0);
-        let t1 = _mm512_add_epi64(mul_lh, t0_lo);
-        let t2 = _mm512_add_epi64(mul_hh, t0_hi);
-        // Lastly, extract the high 32 bits of t1 and add to t2.
-        let t1_hi = _mm512_srli_epi64::<32>(t1);
-        let res_hi = _mm512_add_epi64(t2, t1_hi);
-
-        // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high
-        // position).
-        let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1)));
-        let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll);
-
-        (res_hi, res_lo)
-    }
-}
-
-/// Full 64-bit squaring.
-#[inline]
-fn square64(x: __m512i) -> (__m512i, __m512i) {
-    unsafe {
-        // Get high 32 bits of x. See comment in mul64_64_s.
-        let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x)));
-
-        // All pairwise multiplications.
-        let mul_ll = _mm512_mul_epu32(x, x);
-        let mul_lh = _mm512_mul_epu32(x, x_hi);
-        let mul_hh = _mm512_mul_epu32(x_hi, x_hi);
-
-        // Bignum addition, but mul_lh is shifted by 33 bits (not 32).
-        let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll);
-        let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi);
-        let t0_hi = _mm512_srli_epi64::<31>(t0);
-        let res_hi = _mm512_add_epi64(mul_hh, t0_hi);
-
-        // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high
-        // position).
-        let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh);
-        let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo);
-
-        (res_hi, res_lo)
-    }
-}
-
-/// Given a 128-bit value represented as two 64-bit halves, reduce it modulo the Goldilocks field order.
-///
-/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`.
-#[inline]
-fn reduce128(x: (__m512i, __m512i)) -> __m512i {
-    unsafe {
-        let (hi0, lo0) = x;
-
-        // Find the high 32 bits of hi0.
-        let hi_hi0 = _mm512_srli_epi64::<32>(hi0);
-
-        // Computes lo0_s - hi_hi0 mod FIELD_ORDER.
-        // Makes sense to do as 2^96 = -1 mod FIELD_ORDER.
-        // `sub_no_double_overflow_64_64` is safe to use as `hi_hi0 < 2^32`.
-        let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0);
-
-        // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER
-        // _mm256_mul_epu32 ignores the top 32 bits so just use that.
-        let t1 = _mm512_mul_epu32(hi0, EPSILON);
-
-        // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 < FIELD_ORDER so we can use `add_no_double_overflow_64_64` to get
-        // `lo1 + t1 mod FIELD_ORDER.`
-        add_no_double_overflow_64_64(lo1, t1)
-    }
-}
-
-/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER`.
-///
-/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn mul(x: __m512i, y: __m512i) -> __m512i {
-    reduce128(mul64_64(x, y))
-}
-
-/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER`.
-///
-/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`.
-#[inline]
-fn square(x: __m512i) -> __m512i {
-    reduce128(square64(x))
-}
-
-#[cfg(test)]
-mod tests {
-    use p3_field_testing::test_packed_field;
-
-    use super::{Goldilocks, PackedGoldilocksAVX512, WIDTH};
-
-    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([
-        0xFFFF_FFFF_0000_0001,
-        0xFFFF_FFFF_0000_0000,
-        0xFFFF_FFFE_FFFF_FFFF,
-        0xFFFF_FFFF_FFFF_FFFF,
-        0x0000_0000_0000_0000,
-        0x0000_0000_0000_0001,
-        0x0000_0000_0000_0002,
-        0x0FFF_FFFF_F000_0000,
-    ]);
-
-    const ZEROS: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001,
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001,
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001,
-        0x0000_0000_0000_0000,
-        0xFFFF_FFFF_0000_0001,
-    ]));
-
-    const ONES: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002,
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002,
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002,
-        0x0000_0000_0000_0001,
-        0xFFFF_FFFF_0000_0002,
-    ]));
-
-    test_packed_field!(
-        crate::PackedGoldilocksAVX512,
-        &[super::ZEROS],
-        &[super::ONES],
-        crate::PackedGoldilocksAVX512(super::SPECIAL_VALS)
-    );
-}
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index a0ace698d..a23f0144d 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -5,15 +5,14 @@
 # Usage:
 #   ./bench_vs_plonky3/run.sh [--log-rows K ...] [--num-sequences N] [--runs N]
 #                             [--lambda-only | --p3-only] [--report-dir DIR]
-#                             [--no-p3-patch] [--scalar] [--no-color]
+#                             [--scalar] [--no-color]
 #
 # Defaults: --log-rows 19, --num-sequences 16, --runs 3.
 # With multiple --log-rows values, prints one median row per size.
 #
-# --scalar: disables SIMD at the target-feature level. On x86_64 drops AVX2
-# and AVX-512 (Goldilocks + most of Keccak go scalar, residual SSE2 in
-# p3-keccak). On aarch64 drops the SHA3 NEON extension. Triggers a rebuild
-# when toggling; subsequent runs with the same RUSTFLAGS are cached.
+# --scalar: x86_64 only (elsewhere it warns and leaves RUSTFLAGS unpinned).
+# Drops AVX2 / AVX-512 so Goldilocks (and most of Keccak) run scalar; residual
+# SSE2 in p3-keccak remains. Rebuilds on toggle; same-RUSTFLAGS runs are cached.
 
 set -euo pipefail
 
@@ -22,7 +21,6 @@ ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
 TMP_DIR="/tmp/bench_p3"
 REPORT_DIR=""
 NO_COLOR=false
-NO_P3_PATCH=false
 SCALAR=false
 
 RED='\033[0;31m'
@@ -70,10 +68,6 @@ while [[ $# -gt 0 ]]; do
             REPORT_DIR=$2
             shift 2
             ;;
-        --no-p3-patch)
-            NO_P3_PATCH=true
-            shift
-            ;;
         --scalar)
             SCALAR=true
             shift
@@ -122,78 +116,23 @@ if [ -n "$REPORT_DIR" ]; then
     mkdir -p "$REPORT_DIR/raw"
 fi
 
-# --- Patch toggle -----------------------------------------------------------
-# The root Cargo.toml has a [patch.crates-io] block pointing at the vendored
-# p3-goldilocks-patched (adds BinomiallyExtendable<3>, disables NEON). For the
-# nightly we build against vanilla crates.io p3-goldilocks — we comment the
-# block out and drop the `p3-degree3` feature.
-#
-# Both Cargo.toml AND Cargo.lock are backed up before the build: dropping the
-# patch makes cargo re-resolve p3-goldilocks against crates.io, which rewrites
-# Cargo.lock. The trap restores both so the working tree is clean on exit.
-CARGO_TOML="$ROOT_DIR/Cargo.toml"
-CARGO_LOCK="$ROOT_DIR/Cargo.lock"
-CARGO_TOML_BAK=""
-CARGO_LOCK_BAK=""
-BUILD_FEATURE_FLAGS=()
-if $NO_P3_PATCH; then
-    CARGO_TOML_BAK="$CARGO_TOML.bak.p3bench.$$"
-    cp "$CARGO_TOML" "$CARGO_TOML_BAK"
-    if [ -f "$CARGO_LOCK" ]; then
-        CARGO_LOCK_BAK="$CARGO_LOCK.bak.p3bench.$$"
-        cp "$CARGO_LOCK" "$CARGO_LOCK_BAK"
-    fi
-    # Comment the [patch.crates-io] block and its entries (until the next blank
-    # line or next [section]).
-    python3 - "$CARGO_TOML" <<'PY'
-import sys, pathlib
-path = pathlib.Path(sys.argv[1])
-lines = path.read_text().splitlines(keepends=True)
-out = []
-in_patch = False
-for ln in lines:
-    stripped = ln.strip()
-    if stripped == "[patch.crates-io]":
-        in_patch = True
-        out.append("# " + ln if not ln.startswith("#") else ln)
-        continue
-    if in_patch:
-        if stripped.startswith("[") and stripped.endswith("]"):
-            in_patch = False
-            out.append(ln)
-            continue
-        if stripped == "":
-            in_patch = False
-            out.append(ln)
-            continue
-        out.append("# " + ln if not ln.startswith("#") else ln)
-    else:
-        out.append(ln)
-path.write_text("".join(out))
-PY
-    trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi; if [ -n "$CARGO_LOCK_BAK" ] && [ -f "$CARGO_LOCK_BAK" ]; then mv "$CARGO_LOCK_BAK" "$CARGO_LOCK"; fi' EXIT INT TERM
-    BUILD_FEATURE_FLAGS=(--no-default-features --features parallel)
-fi
-
 # --- Scalar (no SIMD) toggle ------------------------------------------------
-# When --scalar is on, disable vector instruction sets for the build so both
-# provers run against the same scalar baseline. p3-keccak keeps SSE2 residual
-# on x86 — acceptable per the bench workstream (contribution is ~7%).
-#   x86_64   → -avx2,-avx512f         (Goldilocks + most of Keccak go scalar)
-#   aarch64  → -sha3                   (drops Keccak NEON SHA3 extension)
+# When --scalar is on (x86_64 only), disable AVX2/AVX-512 so Goldilocks (and
+# most of Keccak) run scalar for an apples-to-apples comparison against Lambda
+# STARK. The residual SSE2 path on p3-keccak is intentionally left enabled —
+# its contribution to total prove time is ~7%.
 # Cargo caches per-RUSTFLAGS, so toggling scalar vs vector triggers a rebuild
 # on first use but is cached afterwards.
 SCALAR_RUSTFLAGS=""
+SCALAR_ACTIVE=false
 if $SCALAR; then
     case "$(uname -m)" in
         x86_64|amd64)
             SCALAR_RUSTFLAGS="-C target-feature=-avx2,-avx512f"
-            ;;
-        arm64|aarch64)
-            SCALAR_RUSTFLAGS="-C target-feature=-sha3"
+            SCALAR_ACTIVE=true
             ;;
         *)
-            echo "warning: --scalar: unknown arch $(uname -m); not pinning RUSTFLAGS" >&2
+            echo "warning: --scalar: only supported on x86_64; host is $(uname -m), not pinning RUSTFLAGS" >&2
             ;;
     esac
     if [ -n "$SCALAR_RUSTFLAGS" ]; then
@@ -210,24 +149,19 @@ echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}"
 echo -e "  log-rows:       ${YELLOW}${LOG_ROWS[*]}${NC}"
 echo -e "  num-sequences:  ${YELLOW}${NUM_SEQUENCES}${NC}  (columns = $((2 * NUM_SEQUENCES)))"
 echo -e "  runs/size:      ${YELLOW}${RUNS}${NC}  (median reported)"
-if $NO_P3_PATCH; then
-    echo -e "  p3 extension:   ${YELLOW}degree 2 (vanilla, no patch)${NC}"
-else
-    echo -e "  p3 extension:   ${YELLOW}degree 3 (patched, matches Lambda)${NC}"
-fi
-if $SCALAR; then
+echo -e "  p3 extension:   ${YELLOW}degree 3 (forked p3-goldilocks, matches Lambda)${NC}"
+if $SCALAR_ACTIVE; then
     echo -e "  scalar mode:    ${YELLOW}on${NC}  (arch=$(uname -m), RUSTFLAGS=\"${RUSTFLAGS:-}\")"
+elif $SCALAR; then
+    echo -e "  scalar mode:    ${YELLOW}requested (unsupported on $(uname -m))${NC}  (SIMD enabled, compiler default)"
 else
     echo -e "  scalar mode:    ${YELLOW}off${NC}  (SIMD enabled, compiler default)"
 fi
 echo ""
 
 echo -e "${GREEN}[build]${NC} prove_bench"
-# Use the `${arr[@]+...}` expansion so `set -u` doesn't blow up when the
-# feature-flag array is empty (bash 3 on macOS).
 cargo build --release -p bench-vs-plonky3 --bin prove_bench \
-    --manifest-path "$ROOT_DIR/Cargo.toml" \
-    ${BUILD_FEATURE_FLAGS[@]+"${BUILD_FEATURE_FLAGS[@]}"} 2>&1 | tail -5
+    --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -5
 
 # Resolve the actual target directory via cargo metadata so we find the binary
 # whether cargo used ./target/ (default) or a custom CARGO_TARGET_DIR.
@@ -378,10 +312,6 @@ echo ""
 if $RUN_LAMBDA && $RUN_P3; then
     echo -e "Timing window: single-shot end-to-end prove."
 fi
-if $NO_P3_PATCH; then
-    echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2."
-    echo -e "      Lambda keeps degree-3 — extension fields differ across sides."
-fi
 
 # --- Machine-readable report ------------------------------------------------
 
@@ -428,10 +358,14 @@ if [ -n "$REPORT_DIR" ]; then
         echo "fri_queries=219"
         echo "grinding=0"
         echo "runs_per_size=$RUNS"
-        echo "p3_extension=$($NO_P3_PATCH && echo 'degree2_vanilla' || echo 'degree3_patched')"
-        echo "scalar=$($SCALAR && echo on || echo off)"
-        if $SCALAR && [ -n "$SCALAR_RUSTFLAGS" ]; then
+        echo "p3_extension=degree3_fork"
+        if $SCALAR_ACTIVE; then
+            echo "scalar=on"
             echo "rustflags=$SCALAR_RUSTFLAGS"
+        elif $SCALAR; then
+            echo "scalar=requested_unsupported"
+        else
+            echo "scalar=off"
         fi
         echo "timing_window=single_shot_end_to_end_prove_no_verify"
         echo "log_rows_series=$(join_slash "${RESULT_LOG_ROWS[@]}")"
diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs
index b74f18ad2..cc57a3e5d 100644
--- a/bench_vs_plonky3/src/plonky3_config.rs
+++ b/bench_vs_plonky3/src/plonky3_config.rs
@@ -11,20 +11,11 @@ use p3_uni_stark::StarkConfig;
 
 pub type Val = Goldilocks;
 
-/// Cubic extension (default, `p3-degree3` feature): matches Lambda's
-/// `Degree3GoldilocksExtensionField`, irreducible x^3 - 2. Needs the vendored
-/// `p3-goldilocks-patched` crate (enabled via root `[patch.crates-io]`).
-#[cfg(feature = "p3-degree3")]
+/// Cubic extension matching Lambda's `Degree3GoldilocksExtensionField`
+/// (irreducible x^3 - 2). Provided by the forked `p3-goldilocks` via
+/// `BinomiallyExtendable<3>`.
 pub type Challenge = BinomialExtensionField<Val, 3>;
 
-/// Quadratic extension (vanilla upstream p3-goldilocks 0.5.2). Compiled when
-/// `p3-degree3` is disabled, typically together with commenting the root
-/// `[patch.crates-io]` block. Lambda still runs degree 3, so this is NOT a
-/// fair comparison on the extension field — it is used for nightly tracking
-/// against the off-the-shelf P3 config.
-#[cfg(not(feature = "p3-degree3"))]
-pub type Challenge = BinomialExtensionField<Val, 2>;
-
 type ByteHash = Keccak256Hash;
 type U64Hash = PaddingFreeSponge<KeccakF, 25, 17, 4>;
 type FieldHash = SerializingHasher<U64Hash>;
@@ -85,7 +76,7 @@ pub fn plonky3_benchmark_config() -> P3Config {
     let dft = Dft::default();
     let challenger = Challenger::from_hasher(vec![], byte_hash);
 
-    let fri_params = p3_fri::create_benchmark_fri_params(challenge_mmcs);
+    let fri_params = FriParameters::new_benchmark(challenge_mmcs);
 
     let pcs = Pcs::new(dft, val_mmcs, fri_params);
     P3Config::new(pcs, challenger)

From abaa0a2f2d25bbc86e253bea609a8b38979dbcff Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 20 Apr 2026 11:42:55 -0300
Subject: [PATCH 23/34] use ssh for p3 fork

---
 bench_vs_plonky3/Cargo.toml | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index 5b313106f..239abf316 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -7,7 +7,10 @@ edition = "2024"
 # Lambda STARK
 stark = { path = "../crypto/stark", features = ["test-utils"] }
 crypto = { path = "../crypto/crypto", features = ["std", "serde"] }
-math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] }
+math = { path = "../crypto/math", features = [
+    "std",
+    "lambdaworks-serde-binary",
+] }
 
 # Plonky3: pinned to the yetanotherco fork, branch `feat/goldilocks_deg3`.
 # The branch adds BinomiallyExtendable<3> for Goldilocks (x^3 - 2), matching
@@ -15,18 +18,22 @@ math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"]
 # the same git source + ref; declaring any of them as a crates.io dep would
 # pull in a second incompatible p3-field. cargo clones the fork once into
 # ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time.
-p3-air         = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-field       = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-goldilocks  = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-matrix      = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-commit      = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-challenger  = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-symmetric   = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-keccak      = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-fri         = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-uni-stark   = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] }
-p3-dft         = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] }
+p3-air = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-field = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-goldilocks = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-matrix = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-commit = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-challenger = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-symmetric = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-merkle-tree = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-keccak = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-fri = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-uni-stark = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
+    "parallel",
+] }
+p3-dft = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
+    "parallel",
+] }
 
 # Tracing for P3 span-based profiling
 tracing = "0.1"

From d3d41b0f5ff282b974d4aa5434eb579ea90062b8 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 20 Apr 2026 16:30:34 -0300
Subject: [PATCH 24/34] rm old files

---
 bench_vs_plonky3/ANALYSIS_LOG.md    | 432 ----------------------------
 bench_vs_plonky3/INSTRUMENTATION.md | 189 ------------
 bench_vs_plonky3/README.md          |  20 +-
 3 files changed, 4 insertions(+), 637 deletions(-)
 delete mode 100644 bench_vs_plonky3/ANALYSIS_LOG.md
 delete mode 100644 bench_vs_plonky3/INSTRUMENTATION.md

diff --git a/bench_vs_plonky3/ANALYSIS_LOG.md b/bench_vs_plonky3/ANALYSIS_LOG.md
deleted file mode 100644
index ab19e9a1f..000000000
--- a/bench_vs_plonky3/ANALYSIS_LOG.md
+++ /dev/null
@@ -1,432 +0,0 @@
-# Lambda STARK vs Plonky3 — Analysis Log
-
-## Session: 2026-04-14 to 2026-04-16
-
----
-
-## 0. Final Server Baseline (2026-04-16)
-
-**Config:** blowup=2, 219 queries, grinding=0, ext degree 3 both, scalar (no AVX2), parallel (rayon both), identical AIR (32 cols × 2^18).
-
-**Command:** `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench -p bench-vs-plonky3`
-
-### Prove
-
-| Prover | Time | Throughput |
-|--------|------|------------|
-| Lambda | **1.213 s** | 6.92 Melem/s |
-| Plonky3 | **479 ms** | 17.50 Melem/s |
-| **Ratio** | **2.53×** | |
-
-### Verify
-
-| Prover | Time |
-|--------|------|
-| Lambda | **23.3 ms** |
-| Plonky3 | **20.4 ms** |
-| **Ratio** | **1.14×** |
-
-### Gap attribution (734ms = 1213 - 479)
-
-Extension field is MATCHED (both degree 3). The 2.53× is pure algorithm/implementation:
-
-| Cause | Est. savings | % of gap | Effort |
-|-------|-------------|----------|--------|
-| **Quotient domain eval** (2^18 vs 2^19 LDE) | ~220ms | 30% | Low |
-| **Batched FFT** (coset_lde_batch vs per-column) | ~150ms | 20% | Medium |
-| **Alpha decomposition + monomorphization** | ~100ms | 14% | Medium-High |
-| **FRI folding parallel** | ~73ms | 10% | Very low |
-| **Boundary selectors** (vs zerofier precompute) | ~45ms | 6% | Low |
-| **Memory allocation patterns** | ~37ms | 5% | Low |
-| **SSE2 Keccak residual** (~7% hash advantage) | ~50ms | 7% | N/A (can't fix) |
-| Other (compilation, unrolling, tuning) | ~59ms | 8% | - |
-
-### Predicted instruments breakdown (blowup=2, 219q)
-
-| Phase | Predicted time | % |
-|-------|---------------|---|
-| FRI queries (R4) | 180ms | 15% ← NEW bottleneck (2.19× queries) |
-| R2 constraint eval | 168ms | 14% |
-| R4 deep comp poly | 131ms | 11% |
-| R1 Main Merkle | 105ms | 9% |
-| R4 FRI commit | 76ms | 6% |
-| R1 reconstruct LDE | 71ms | 6% |
-| R3 OOD eval | 71ms | 6% |
-| R1 Main LDE | 65ms | 5% |
-| R4 deep extend | 52ms | 4% |
-| R2 comp Merkle | 13ms | 1% |
-| Pre-pass | 11ms | 1% |
-
-### Optimization roadmap (ranked by impact/effort)
-
-| # | Optimization | Savings | Effort | Result |
-|---|-------------|---------|--------|--------|
-| 1 | Quotient domain (stride=blowup in evaluator) | ~80ms | 1h | 1.13s |
-| 2 | Parallel FRI fold (par_iter) | ~40ms | 30min | 1.09s |
-| 3 | Boundary selectors (replace zerofier precompute) | ~45ms | 2h | 1.05s |
-| 4 | LogUp alpha precompute | ~10ms | 30min | 1.04s |
-| 5 | Monomorphize constraints (enum dispatch) | ~35ms | 4h | 1.00s |
-| 6 | Batched FFT (coset_lde_batch pattern) | ~150ms | 8h | 0.85s |
-| 7 | Row-major trace storage | ~20ms | 8h | 0.83s |
-
-**With items 1-5 (~210ms, ~8h work):** Lambda ~1.0s vs Plonky3 0.48s = **2.08×**
-**With items 1-7 (~380ms, ~24h work):** Lambda ~0.83s vs Plonky3 0.48s = **1.73×**
-**Remaining gap** after all: ~350ms from SSE2 Keccak + deep comp + Plonky3 micro-optimizations
-
-### M1 instruments breakdown (with PR #492, blowup=2, ext3 both)
-
-**Command:** `RUSTFLAGS="-C target-feature=-sha3" cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture`
-
-| Fase | Lambda (1.068s) | % | Plonky3 (352ms) | % | Ratio |
-|------|-----------------|---|-----------------|---|-------|
-| Trace commit (LDE+Merkle) | 317ms (LDE 127 + Merkle 165) | 30% | 138ms (commit to trace data) | 39% | 2.3× |
-| **Constraint eval** | **325ms** | **30%** | **50ms** (quotient_values) | **14%** | **6.5×** |
-| Quotient commit | 53ms | 5% | 49ms | 14% | 1.1× |
-| OOD eval | 62ms | 6% | ~10ms (Lagrange interp) | 3% | 6.2× |
-| Deep comp poly | 173ms | 16% | (inside "open") | | |
-| Deep extend | 36ms | 3% | | | |
-| FRI commit (folding+Merkle) | 83ms | 8% | 47ms (commit phase) | 13% | 1.8× |
-| FRI queries | 1ms | 0% | 2ms (query phase) | 1% | — |
-| Open total | 293ms | 27% | 110ms | 31% | 2.7× |
-| Pre-pass | 7ms | 1% | — | | |
-
----
-
-## Fairness Audit
-
-### AIR equivalence: VERIFIED
-
-Both AIRs prove the same mathematical statement:
-- 32 cols × 2^18 rows, 2-row window
-- Constraint 1: `next_left = local_left + local_right`
-- Constraint 2: `next_right = local_right + next_left`
-- Boundary: row 0 pins `(a_s, b_s) = (s+1, s+2)` per sequence
-- Test `lambda_pair_trace_matches_plonky3_trace` verifies ALL cells (not subset)
-- Mathematical trace for seq (1,2): (1,2)→(3,5)→(8,13)→(21,34) — identical both sides
-
-### Parameters: ALL MATCHED (except noted)
-
-| Parameter | Lambda | Plonky3 | Status |
-|-----------|--------|---------|--------|
-| Base field | Goldilocks | Goldilocks | ✅ |
-| Extension | degree 3 (`x³−2`) | degree 3 (`x³−2`, vendored) | ✅ |
-| Blowup | 2 | 2 (log_blowup=1) | ✅ |
-| FRI queries | 219 | 219 | ✅ |
-| Grinding | 0 | 0 | ✅ |
-| Hash | Keccak-256 | Keccak-256 | ✅ |
-| Rayon | ON | ON (p3-uni-stark/parallel + p3-dft/parallel) | ✅ |
-| SIMD Goldilocks | OFF | OFF (NEON patched to `Self`) | ✅ |
-| SIMD Keccak (x86) | scalar (sha3 crate) | SSE2 2-wide | ⚠️ residual |
-| SIMD Keccak (M1 with -sha3) | scalar | scalar (fallback) | ✅ |
-
-### Platform fairness guide
-
-| Platform | Command | Keccak P3 | Goldilocks P3 | Fairness |
-|----------|---------|-----------|---------------|----------|
-| **M1 + `-sha3`** | `RUSTFLAGS="-C target-feature=-sha3" cargo bench ...` | Scalar | Scalar | **100% fair** |
-| M1 no flags | `cargo bench ...` | NEON SHA3 HW | Scalar | P3 has Keccak HW |
-| **x86 + `-avx2,-avx512f`** | `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ...` | SSE2 2-wide | Scalar | ~93% fair |
-| x86 no flags | `cargo bench ...` | AVX2 4-wide | AVX2 4-wide | P3 has full SIMD |
-
-**For fairest comparison: M1 with `-sha3`** — only platform where everything is scalar both sides.
-
-### Security model asymmetry (doesn't affect compute, affects interpretation)
-
-- **Lambda (Johnson Bound, proven):** 219 queries × 0.49 bits/query = **~108 bits** proven security
-- **Plonky3 (ethSTARK conjecture):** 219 queries × 1.0 bit/query = **~219 bits** conjectured (cap 192 by field)
-- Same 219 queries = same computational work. Different security interpretation.
-- For "matched security" at 108 conjectured bits, P3 would need only ~108 queries (half the FRI work)
-
-### What's NOT unfairness (architectural differences = what we measure)
-
-These are implementation choices, not benchmark bias:
-- Quotient domain eval (P3) vs full LDE eval (Lambda) → 6.5× constraint eval
-- Monomorphization (P3) vs vtable dispatch (Lambda) → ~1.2× overhead
-- Batched FFT (P3) vs per-column (Lambda) → ~2× trace commit
-- Row-major (P3) vs column-major (Lambda) → cache efficiency
-- Boundary selectors (P3) vs zerofier precompute (Lambda) → ~2× boundary cost
-
-### What IS potential unfairness
-
-1. SSE2 Keccak on x86 — P3 gets 2-wide Keccak, Lambda doesn't. ~7% of total. Unavoidable on x86.
-2. Lambda samples NO extra LogUp/bus challenges for this AIR (verified: `has_aux_trace() = false` skips sampling).
-3. Lambda wraps in `multi_prove` with vec of 1 — transcript clone overhead is negligible.
-
-**Conclusion: The benchmark is fair for comparing prover implementation efficiency.**
-
----
-
-## 1. Benchmark Setup
-
-### AIR (identical both sides)
-- 16 Fibonacci sequences, 2 cols/sequence = **32 columns**
-- **2^18 rows** (each row packs 2 Fibonacci steps → 2^19 effective steps)
-- 2-row window: `next.left = local.left + local.right`, `next.right = local.right + next.left`
-- 32 boundary constraints pinning initial values via public inputs
-- Test `lambda_pair_trace_matches_plonky3_trace` verifies cell-by-cell equivalence
-
-### Matched parameters
-- Base field: Goldilocks (p = 2^64 − 2^32 + 1)
-- Blowup: 4
-- FRI queries: 100
-- Grinding: 0
-- Hash: Keccak-256 (scalar on both sides when `-C target-feature=-sha3`)
-
-### Unmatched (architectural)
-- **Extension field:** Lambda degree 3 (`x^3 - 2`, 192-bit), Plonky3 degree 2 (`x^2 - 7`, 128-bit)
-  - Plonky3 0.5.2 has Goldilocks extensions for degree 2 and 5, but NOT degree 3
-  - Lambda ext-mul: 9 base muls + 3 reduce128
-  - Plonky3 ext-mul: 4 base muls + 2 adds
-- **Prover architecture:** Lambda multi_prove (even for 1 AIR), Plonky3 uni-stark
-
-### Patches applied
-1. `bench_vs_plonky3/vendor-p3-goldilocks/` — `Packing = Self` on aarch64 (disables NEON)
-2. `p3-uni-stark` and `p3-dft` features `["parallel"]` enabled
-3. `stark` feature `parallel` enabled by default in bench
-
-### Files
-- `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` — Lambda AIR matching P3 shape
-- `bench_vs_plonky3/src/plonky3_fibonacci.rs` — Plonky3 AIR
-- `bench_vs_plonky3/src/plonky3_config.rs` — P3 config (matched FRI params)
-- `bench_vs_plonky3/benches/stark_comparison.rs` — Criterion benchmark
-- `bench_vs_plonky3/vendor-p3-goldilocks/` — Patched p3-goldilocks (no NEON)
-- Root `Cargo.toml` — `[patch.crates-io]` for vendor p3-goldilocks
-
----
-
-## 2. Measurements
-
-### Config A: Both rayon, no SIMD, no SHA3 HW (M1 Max)
-
-Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3`
-
-| | Lambda | Plonky3 | Ratio |
-|--|--------|---------|-------|
-| **Prove** | **2.09s** [1.99, 2.20] | **0.86s** [0.84, 0.87] | **P3 2.43× faster** |
-| **Verify** | **6.58ms** | **6.76ms** | **Lambda 1.03× faster** |
-
-### Config B: Lambda rayon ON, Plonky3 rayon OFF, NEON ON (M1 — earlier run)
-
-Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3` (before adding p3 parallel features)
-
-| | Lambda | Plonky3 | Ratio |
-|--|--------|---------|-------|
-| **Prove** | **3.46s** | **2.92s** | **P3 1.18× faster** |
-
-### Config C: Lambda rayon ON, Plonky3 rayon OFF, NEON ON, SHA3 HW ON (M1 — first run)
-
-Command: `cargo bench -p bench-vs-plonky3` (no RUSTFLAGS)
-
-| | Lambda | Plonky3 | Ratio |
-|--|--------|---------|-------|
-| **Prove** | **3.21s** | **1.67s** | **P3 1.92× faster** |
-
-### Server instruments breakdown (Lambda only, 16 cols × 2^18 pair AIR)
-
-Total: **1.246s**
-
-| Phase | Time | % |
-|-------|------|---|
-| R2 constraint eval | 336ms | 27% |
-| R1 Main Merkle | 211ms | 17% |
-| R1 reconstruct (re-LDE) | 143ms | 11% |
-| R4 deep comp poly | 131ms | 11% |
-| R1 Main LDE | 130ms | 10% |
-| R4 FRI commit | 80ms | 6% |
-| R3 OOD eval | 71ms | 6% |
-| R2 comp Merkle | 54ms | 4% |
-| R4 deep extend | 43ms | 3% |
-| Pre-pass | 11ms | 1% |
-
----
-
-## 3. Root Cause Analysis
-
-### Why Plonky3 is ~2.4× faster (Config A)
-
-#### 3a. Constraint eval domain: 4× overhead (biggest factor)
-- Lambda evaluates constraints on full LDE domain: `N × blowup = 2^20 points` (`evaluator.rs:274`)
-- Plonky3 evaluates on quotient domain: `N = 2^18 points`, then extends via iFFT + FFT
-- Lambda does 4× more constraint evaluations (each involving ext-field ops, frame fill, zerofier division)
-- **Estimated contribution: 1.5-2× of the gap**
-
-#### 3b. Extension field degree 3 vs 2
-- Lambda: 9 base muls per ext-mul (`extensions_goldilocks.rs:293-309`)
-- Plonky3: 4 base muls per ext-mul (`binomial_extension.rs:747-762`)
-- Affects: composition poly, FRI folding, DEEP openings, OOD
-- **Estimated contribution: 1.3-1.5× of the gap**
-
-#### 3c. Virtual dispatch vs monomorphization
-- Lambda: `Vec<Box<dyn TransitionConstraint>>` → vtable call per constraint per point (`traits.rs:248-250`)
-- Plonky3: `air.eval(&mut folder)` → monomorphized, all constraints inlined
-- For 32 constraints × 2^20 points = 32M vtable dispatches in Lambda
-- **Estimated contribution: 1.1-1.2× of the gap**
-
-#### 3d. Data layout: column-major vs row-major
-- Lambda: column-major (cache miss per column access in constraint loop)
-- Plonky3: row-major (contiguous data per row)
-- **Estimated contribution: 1.05-1.1× of the gap**
-
-#### 3e. FRI folding sequential vs parallel
-- Lambda: sequential loop in `fold_evaluations_in_place` (`fri_functions.rs:21`)
-- Plonky3: `par_rows()` parallelized
-- **Estimated contribution: 1.03-1.05× of the gap**
-
-#### Combined: 1.5 × 1.4 × 1.15 × 1.07 × 1.04 ≈ **2.7× (close to measured 2.43×)**
-
-### Why verify is roughly equal
-- Verify doesn't do LDE, Merkle, or constraint eval
-- Only ~100 point openings + FRI check
-- Extension field penalty minimal at small N
-- Lambda's implementation is competitive on this path
-
----
-
-## 4. SIMD Analysis (from profiling session)
-
-### NEON (aarch64/M1)
-- `target_feature="neon"` and `target_feature="sha3"` are **default on aarch64-apple-darwin**
-- Plonky3 uses `PackedGoldilocksNeon` (WIDTH=2) unconditionally on aarch64 via `#[cfg(target_arch = "aarch64")]`
-- Plonky3 Keccak uses NEON SHA3 instructions (`veor3q_u64`, `vbcaxq_u64`, etc.)
-- Lambda has NO SIMD in the prover
-- **Goldilocks NEON base-field mul is 0.92× SLOWER** than scalar (no native 64×64→128 on NEON)
-- **Fp3 NEON mul is 1.40× faster** (parallelism helps with 3 components)
-- **FFT with SIMD was 0.88× (slower)** due to pack/unpack overhead
-
-### Disabling SIMD
-- NEON packing: patched via `vendor-p3-goldilocks` (`type Packing = Self` on aarch64)
-- SHA3 hardware Keccak: `-C target-feature=-sha3` (RUSTFLAGS)
-- Cannot disable NEON via RUSTFLAGS alone (intrinsics used without `#[target_feature]` annotation)
-
-### x86_64 (server)
-- Without `-C target-cpu=native`: only SSE2 (no AVX2) → Plonky3 scalar too
-- With AVX2: `PackedGoldilocksAVX2` (WIDTH=4) — has native `mulq` so SIMD IS beneficial
-- For fair scalar comparison on x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"`
-
----
-
-## 5. Plonky3 Parallelism
-
-- `p3-maybe-rayon` feature `parallel` is NOT enabled by default
-- Without it, all `par_iter()` calls fall back to `core::iter` (sequential)
-- `Radix2DitParallel` is "parallel" in name only without the feature
-- Must explicitly enable: `p3-uni-stark = { version = "0.5.2", features = ["parallel"] }` + `p3-dft = ...`
-- Verified via `cargo tree -e features | grep p3-maybe-rayon`
-
----
-
-## 6. Lambda Profiling Results (server, profile_prover, 2^20 × 16 cols)
-
-### Single-threaded (38.7s)
-| Component | % | Category |
-|-----------|---|----------|
-| Constraint evaluation | 32.1% | Compute |
-| Keccak hashing | 15.1% | Hashing |
-| Deep composition poly | 14.0% | Compute |
-| Merkle tree build | 12.0% | Hashing |
-| Field multiplication | 11.1% | Compute |
-| FFT | 10.5% | FFT |
-| Other | 5.2% | |
-
-### Parallel (12 threads, 19.2s — 2.02× speedup)
-| Metric | Value |
-|--------|-------|
-| Parallel efficiency | 16.8% of ideal 12× |
-| CPU utilization | 30.6% |
-| Main thread work | 13.3s |
-| Worker thread work | ~5s each |
-| New #1 bottleneck | Keccak (16.7%) |
-
-### Key profiling findings
-- 100% CPU-bound (no memory/IO stalls)
-- SIMD PackedGoldilocks types exist but are NOT used by prover
-- Iterator overhead (Map::fold + FnMut): 7.6%
-- Memory allocation overhead: 8.9% (page faults + malloc + cfree)
-- Amdahl's Law: ~34% serial portion limits parallel speedup
-
----
-
-## 7. Optimizations Implemented (then stashed)
-
-### Item 2: Parallel FRI folding
-- File: `crypto/stark/src/fri/fri_functions.rs`
-- Change: `(0..half).into_par_iter().map().collect()` with `#[cfg(feature = "parallel")]`
-- Also: `crypto/stark/src/fri/mod.rs` — added `Send + Sync` bounds
-- Tests: 450/450 passed (121 stark + 326 VM + 3 bench)
-
-### Item 3: Quotient domain constraint evaluation
-- File: `crypto/stark/src/constraints/evaluator.rs` — added `lde_stride: usize` parameter
-- File: `crypto/stark/src/prover.rs` — when `number_of_parts == 1`, uses `lde_stride = blowup_factor`
-  then extends N evaluations to LDE via `interpolate_offset_fft + evaluate_polynomial_on_lde_domain`
-- Tests: 450/450 passed
-- Impact on M1: 2.09s → 2.02s (~3%, within Criterion noise)
-- Impact limited because iFFT+FFT extension cost offsets constraint eval savings
-
-### Why stashed
-User wants clean baseline first (fair comparison), then optimize. These changes are ready to re-apply.
-
----
-
-## 8. Optimization Priority (from profiling data)
-
-### With parallel enabled (real-world scenario)
-
-| # | Optimization | Impact (parallel) | Effort | Status |
-|---|-------------|-------------------|--------|--------|
-| 1 | PR 492 (LDE cache) | 5-8% (reduces serial) | Done (PR open) | Waiting merge |
-| 2 | BLAKE3 hash | ~12% (Keccak is parallel bottleneck) | Low | Not started |
-| 3 | Quotient domain eval | 3-5% (constraint eval parallelized already) | Medium | Implemented, stashed |
-| 4 | Reduce allocations | 5-8% | Medium | Not started |
-| 5 | Parallel FRI fold | ~3% | Low | Implemented, stashed |
-| 6 | Monomorphize constraints | 3-5% | High | Not started |
-
-### Plonky3 degree-3 extension (Option C)
-- Would eliminate the last asymmetric variable in the comparison
-- Requires implementing `BinomiallyExtendable<3>` for Goldilocks in vendored crate
-- Need Sage computation for: `DTH_ROOT = 2^((p-1)/3)`, `EXT_GENERATOR`
-- Expected: gap drops from 2.43× to ~1.5-1.7× (confirms extension degree accounts for ~40% of gap)
-
----
-
-## 9. How to Run
-
-### M1 / aarch64 (scalar comparison)
-```bash
-RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3
-```
-
-### x86_64 server (scalar comparison, no AVX2)
-```bash
-cargo bench -p bench-vs-plonky3
-# or explicitly: RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ...
-```
-
-### With instruments (Lambda phase breakdown)
-```bash
-# Add "instruments" to stark features in bench_vs_plonky3/Cargo.toml first
-cargo bench -p bench-vs-plonky3 --features stark/instruments
-```
-
-### Verify correctness
-```bash
-cargo test -p bench-vs-plonky3  # 3 tests
-cargo test -p stark --lib       # 121 tests
-cargo test -p lambda-vm-prover  # 326 tests
-```
-
----
-
-## 10. Key Files Reference
-
-| File | Purpose |
-|------|---------|
-| `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` | Lambda AIR (32 cols, 2-row window) |
-| `bench_vs_plonky3/src/plonky3_fibonacci.rs` | Plonky3 AIR (matching) |
-| `bench_vs_plonky3/src/plonky3_config.rs` | P3 config (FRI params matched) |
-| `bench_vs_plonky3/benches/stark_comparison.rs` | Criterion benchmark |
-| `bench_vs_plonky3/vendor-p3-goldilocks/` | Patched p3-goldilocks (no NEON) |
-| `crypto/stark/src/constraints/evaluator.rs` | Constraint eval loop (bottleneck) |
-| `crypto/stark/src/prover.rs` | Prover pipeline (Round 1-4) |
-| `crypto/stark/src/fri/fri_functions.rs` | FRI folding |
-| `crypto/stark/src/domain.rs` | LDE domain definition |
-| `crypto/math/src/fft/polynomial.rs` | FFT / coset_lde_full_expand |
diff --git a/bench_vs_plonky3/INSTRUMENTATION.md b/bench_vs_plonky3/INSTRUMENTATION.md
deleted file mode 100644
index b7b6bd4b1..000000000
--- a/bench_vs_plonky3/INSTRUMENTATION.md
+++ /dev/null
@@ -1,189 +0,0 @@
-# `bench_vs_plonky3` — puntos de instrumentación
-
-Guía de referencia para revisores / handoff. Describe **dónde está cada timer
-y qué mide** en la comparación Lambda STARK vs Plonky3. No describe el AIR
-en sí (eso vive en `ANALYSIS_LOG.md`).
-
-## Cómo correrlo
-
-El test que imprime el breakdown se llama `instruments_breakdown`. Hay que
-compilar con la feature `instruments` y pasar `--nocapture` porque la salida
-va a stdout (si no, `cargo test` se la come).
-
-**x86 (Goldilocks scalar, SSE2 Keccak residual en P3):**
-
-```bash
-RUSTFLAGS="-C target-feature=-avx2,-avx512f" \
-cargo test -p bench-vs-plonky3 --features instruments --release -- \
-  instruments_breakdown --nocapture
-```
-
-## Entrada principal
-
-- Archivo: `bench_vs_plonky3/src/lib.rs`
-- Función: `instruments_breakdown` (línea 82)
-- AIR Fibonacci fijo:
-  - `num_sequences = 16`
-  - `rows = 1 << 18` (2^18)
-  - columns = 32 (2 por secuencia)
-  - `blowup_factor = 2`
-  - `fri_number_of_queries = 219`
-  - `grinding_factor = 0`
-
-El test hace dos pasadas independientes:
-
-1. Corre Lambda STARK con los timers internos del crate `stark` (feature
-   `instruments`).
-2. Corre Plonky3 con un `tracing_subscriber` custom que captura spans.
-
-## Feature flags
-
-`bench_vs_plonky3/Cargo.toml` (líneas 33-40):
-
-```toml
-[features]
-default    = ["parallel"]
-parallel   = ["stark/parallel"]
-instruments = ["stark/instruments"]
-```
-
-`crypto/stark/Cargo.toml` (líneas 35-41):
-
-```toml
-[features]
-instruments = []                       # prints de timing en prover/verifier
-parallel    = ["dep:rayon", "crypto/parallel"]
-```
-
-`instruments` y `parallel` **coexisten** (no son excluyentes). En la práctica
-los benchmarks corren siempre con ambos activos: Plonky3 usa
-`Radix2DitParallel` (rayon) unconditionally, así que Lambda también tiene que
-correr en paralelo para comparar apples-to-apples.
-
-## Lambda: estructuras de timing
-
-`crypto/stark/src/instruments.rs`.
-
-### `MultiProveTiming` (líneas 40-50)
-
-Recolectada dentro de `multi_prove` y consumida por el test vía
-`stark::instruments::take()`.
-
-| Campo | Qué mide |
-|---|---|
-| `prepass` | Construcción de domains + `LdeTwiddles` caches. |
-| `main_commits` | Round 1 Phase A: commit de todos los main traces. |
-| `aux_build` | Round 1 Phase B: construcción de aux traces / LogUp. |
-| `aux_commit` | Round 1 Phase B: LDE + Merkle commit de aux traces. |
-| `rounds_2_4` | Tiempo total de Rounds 2-4 (todas las tablas). |
-| `round1_sub` | Sub-op breakdown de Round 1 (`Round1SubOps`). |
-| `table_timings` | Por tabla: `(name, rows, duration, TableSubOps)`. |
-
-### `Round1SubOps` (líneas 28-37)
-
-Sub-ops dentro de Round 1. Se acumulan en `AtomicU64`, así que workers rayon
-las pueden incrementar en paralelo sin perder datos.
-
-| Campo | Qué mide |
-|---|---|
-| `main_lde` | Main trace: `expand_columns_to_lde` (LDE/FFT). |
-| `main_merkle` | Main trace: `commit_columns_bit_reversed` (Merkle). |
-| `aux_lde` | Aux trace: `expand_columns_to_lde`. |
-| `aux_merkle` | Aux trace: `commit_columns_bit_reversed`. |
-
-### `TableSubOps` (líneas 7-24)
-
-Por tabla, dentro de Rounds 2-4. Las partes de R2/R4 se pasan por
-thread-locals (`R2_SUB`, `R4_SUB`) y después se ensamblan en
-`prove_rounds_2_to_4` (ver más abajo).
-
-| Campo | Round | Qué mide |
-|---|---|---|
-| `constraints` | R2 | `evaluator.evaluate()` — constraints sobre dominio LDE. |
-| `comp_decompose` | R2 | `decompose_and_extend_d2` — iFFT + extensión del composition poly. |
-| `comp_commit` | R2 | Merkle commit del composition poly. |
-| `ood` | R3 | Barycentric OOD eval (ver nota sobre dónde se captura). |
-| `deep_comp` | R4 | `compute_deep_composition_poly_evaluations`. |
-| `deep_extend` | R4 | `interpolate_fft` + `evaluate_fft` para extender el deep comp poly. |
-| `fri_commit` | R4 | `fri::commit_phase_from_evaluations` (folds + Merkle layers). |
-| `queries` | R4 | Grinding (si hay) + sampling + FRI query phase + Merkle openings. |
-
-### Dónde se capturan (en `crypto/stark/src/prover.rs`)
-
-- `multi_prove` (línea 1490):
-  - `reset_all()` (1502).
-  - `prepass` timer (1515-1533).
-  - `main_commits` timer (1541-…).
-  - `aux_build`, `aux_commit` timers (durante Round 1 Phase B).
-  - `rounds_2_4` timer; al final: `store(MultiProveTiming)`.
-- `round_2_compute_composition_polynomial` — `constraints` / `comp_decompose` /
-  `comp_commit` (vía `store_r2_sub`).
-- `prove_rounds_2_to_4` — **acá** se captura el OOD:
-  `round_3_dur = t_r3.elapsed()` en líneas 1957-1967, y se guarda en
-  `TableSubOps.ood` (línea 2010). `round_3_evaluate_polynomials_in_out_of_domain_element`
-  **no** tiene instrumentación propia.
-- `round_4_compute_and_run_fri_on_the_deep_composition_polynomial` —
-  `deep_comp` / `deep_extend` / `fri_commit` / `queries`
-  (vía `store_r4_sub`).
-
-## Plonky3: breakdown por spans
-
-Todo vive dentro de `instruments_breakdown` en `bench_vs_plonky3/src/lib.rs`,
-después del bloque de Lambda.
-
-- Se define una `P3TimingLayer` custom (líneas 216-259) que implementa
-  `tracing_subscriber::Layer`:
-  - `on_new_span` guarda el nombre del span.
-  - `on_enter` guarda `Instant::now()`.
-  - `on_close` calcula `start.elapsed()` y lo empuja a un `Vec<(name, ms)>`.
-- Se monta un subscriber con `LevelFilter::DEBUG` (línea 266) y se instala
-  como default **sólo durante el `p3_uni_stark::prove`** (líneas 275-280,
-  scope con `_guard`).
-- Post-prove: orden descendente por duración (287), filtra spans con
-  `ms >= 0.1` (289), y calcula `(unaccounted) = total − Σspans` (293-301).
-
-### Qué implica el diseño
-
-- **La capa no filtra por crate**: captura *cualquier* span DEBUG emitido
-  mientras el subscriber está vivo. En la práctica sólo corre
-  `p3_uni_stark::prove` dentro de ese bloque, así que todos los spans que
-  salen son de Plonky3 — pero si alguien agrega un `#[instrument]` propio
-  dentro del scope del guard, también se va a contar.
-- **No hay instrumentación manual de funciones de Plonky3.** La granularidad
-  del breakdown = spans que Plonky3 ya emite internamente.
-- **Nesting / doble-conteo:** P3 tiene spans anidados (p.ej.
-  `prove ⊃ compute_quotient_values ⊃ evaluate_constraints`). Cada span se
-  cuenta una vez con su wall-clock entre `on_enter` y `on_close`, así que
-  **`Σspans > wall-clock` es esperable, no es un bug**. Consecuencia:
-  `(unaccounted) = total − Σspans` **puede quedar negativo** en presencia de
-  nesting — no significa que falte tiempo, significa que los spans padre se
-  solapan con sus hijos. El código sólo imprime `(unaccounted)` si
-  `> 1.0ms`, así que casos negativos se silencian.
-
-## Segunda capa de instrumentación (no la usa `bench_vs_plonky3`)
-
-Existe una capa adicional en `prover/src/instruments.rs` (líneas 54-211,
-`print_report`) — orientada al ejecutor del VM (execute + trace build + AIR
-construction) que además re-imprime el `MultiProveTiming` del STARK con
-otro formato. `bench_vs_plonky3` **no** la invoca; sólo consume
-`stark::instruments::take()` directamente. Vale la pena saberlo si buscás
-timings y aparecen en logs distintos.
-
-## Advertencias para el revisor
-
-1. Lambda: timing manual, específico del pipeline `multi_prove`. Granularidad
-   fina pero acoplada al código — moverlo rompe los breakpoints.
-2. Plonky3: span-based. Granularidad = la que P3 decida exponer. Si P3 deja
-   de emitir un span en una versión futura, la línea desaparece del reporte
-   sin previo aviso.
-3. Los porcentajes de Lambda se calculan contra el **total wall-clock del
-   test** (no contra `rounds_2_4`), así que la suma no cierra al 100% — hay
-   tiempo fuera de `multi_prove` (construcción de AIR, setup).
-4. Los porcentajes de Plonky3 se calculan contra **`p3_prove_dur`** (solo el
-   `prove`, sin setup).
-5. El benchmark usa **degree 3** para la extensión de Plonky3 vía git deps a
-   la rama `feat/goldilocks_deg3` del fork `yetanotherco/Plonky3` (ver
-   `bench_vs_plonky3/Cargo.toml`), que provee `BinomiallyExtendable<3>`
-   para Goldilocks con el mismo irreducible `x^3 - 2` que Lambda.
-6. Plataforma: x86 con `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` →
-   Goldilocks scalar, residual SSE2 en Keccak de P3 (~7%).
diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md
index fea3c8d7e..066582280 100644
--- a/bench_vs_plonky3/README.md
+++ b/bench_vs_plonky3/README.md
@@ -32,16 +32,6 @@ with `plonky3_config::matched_params_config`. Both AIRs are **cell-by-cell
 equivalent** — this is asserted by the `lambda_pair_trace_matches_plonky3_trace`
 test.
 
-## Prerequisites
-
-- Rust stable (the crate builds with `cargo build --release`).
-- No SP1 toolchain needed — there's no VM guest compilation.
-- Read access to `https://github.com/yetanotherco/Plonky3.git` (branch
-  `feat/goldilocks_deg3`). Cargo clones it into `~/.cargo/git/db` on the
-  first build and `Cargo.lock` pins the SHA. The branch provides
-  `BinomiallyExtendable<3>` for Goldilocks (`x^3 - 2`, matching Lambda's
-  `Degree3GoldilocksExtensionField`).
-
 ## Usage
 
 ```bash
@@ -163,9 +153,6 @@ cargo test -p bench-vs-plonky3 --features instruments --release -- \
     Spans nest (e.g. `prove ⊃ compute_quotient_values`), so Σspans > total is
     expected and not a bug. `(unaccounted)` can be negative from nesting.
 
-Details of every timer (which method it wraps, where it lives) are in
-[`INSTRUMENTATION.md`](INSTRUMENTATION.md).
-
 The nightly does **not** activate this path — it would add ~1 % overhead and
 pollute the historical wall-clock numbers.
 
@@ -182,6 +169,7 @@ pollute the historical wall-clock numbers.
   and AVX-512 so Goldilocks arithmetic is scalar on both sides. `p3-keccak`'s
   SSE2 path on x86 is not disabled.
 - **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both
-  sides. Security models differ (Lambda: Johnson-bound, ~108 bits; P3:
-  conjectured, ~192 bits) — the compute work is equivalent, the claimed
-  soundness is not. See `ANALYSIS_LOG.md` for the full fairness audit.
+  sides. Security models differ (Lambda: Johnson-bound, ~108 bits proven;
+  P3: conjectured, 219 queries × 1 bit = 219 bits, capped at 192 by the
+  cubic extension field) — the compute work is equivalent, the claimed
+  soundness is not.

From 3010b38c353a8d2036e6c6ea27e3c4f2afb7fd78 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 20 Apr 2026 17:11:48 -0300
Subject: [PATCH 25/34] address comments

---
 bench_vs_plonky3/Cargo.toml            | 24 +++++------
 bench_vs_plonky3/run.sh                |  6 +--
 bench_vs_plonky3/src/lib.rs            | 56 ++++++++++++++++++++------
 bench_vs_plonky3/src/plonky3_config.rs | 13 ------
 4 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index 239abf316..39f8c7330 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -18,20 +18,20 @@ math = { path = "../crypto/math", features = [
 # the same git source + ref; declaring any of them as a crates.io dep would
 # pull in a second incompatible p3-field. cargo clones the fork once into
 # ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time.
-p3-air = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-field = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-goldilocks = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-matrix = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-commit = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-challenger = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-symmetric = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-merkle-tree = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-keccak = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-fri = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-uni-stark = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
+p3-air = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-field = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-goldilocks = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-matrix = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-commit = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-challenger = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-symmetric = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-keccak = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-fri = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
+p3-uni-stark = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
     "parallel",
 ] }
-p3-dft = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
+p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
     "parallel",
 ] }
 
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index a23f0144d..1bdedbb5c 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -18,7 +18,8 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
-TMP_DIR="/tmp/bench_p3"
+TMP_DIR="$(mktemp -d -t bench_p3.XXXXXX)"
+trap 'rm -rf "$TMP_DIR"' EXIT
 REPORT_DIR=""
 NO_COLOR=false
 SCALAR=false
@@ -109,9 +110,6 @@ if $NO_COLOR; then
     NC=''
 fi
 
-mkdir -p "$TMP_DIR"
-rm -rf "$TMP_DIR"/*
-
 if [ -n "$REPORT_DIR" ]; then
     mkdir -p "$REPORT_DIR/raw"
 fi
diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs
index d61c6ea9e..7c722153e 100644
--- a/bench_vs_plonky3/src/lib.rs
+++ b/bench_vs_plonky3/src/lib.rs
@@ -76,8 +76,9 @@ mod tests {
     }
 
     /// Lambda prove with instruments breakdown + P3 span-based breakdown.
-    /// Run: cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture
+    /// Run: cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --ignored --nocapture
     #[test]
+    #[ignore = "heavy: run with --release -- instruments_breakdown --ignored --nocapture"]
     fn instruments_breakdown() {
         let num_sequences = 16;
         let rows = 1 << 19;
@@ -211,8 +212,14 @@ mod tests {
 
         type SpanResults = Arc<Mutex<Vec<(String, f64)>>>;
 
+        struct SpanState {
+            name: String,
+            active_since: Option<std::time::Instant>,
+            accumulated: std::time::Duration,
+        }
+
         struct P3TimingLayer {
-            spans: Mutex<HashMap<u64, (String, Option<std::time::Instant>)>>,
+            spans: Mutex<HashMap<u64, SpanState>>,
             results: SpanResults,
         }
 
@@ -227,19 +234,39 @@ mod tests {
                 _ctx: tracing_subscriber::layer::Context<'_, S>,
             ) {
                 let name = attrs.metadata().name().to_string();
-                self.spans
-                    .lock()
-                    .unwrap()
-                    .insert(id.into_u64(), (name, None));
+                self.spans.lock().unwrap().insert(
+                    id.into_u64(),
+                    SpanState {
+                        name,
+                        active_since: None,
+                        accumulated: std::time::Duration::ZERO,
+                    },
+                );
             }
 
+            // Rayon can re-enter a span across threads, so only start timing on
+            // the first enter after each exit; accumulate every interval.
             fn on_enter(
                 &self,
                 id: &tracing::span::Id,
                 _ctx: tracing_subscriber::layer::Context<'_, S>,
             ) {
-                if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) {
-                    entry.1 = Some(std::time::Instant::now());
+                if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
+                    && entry.active_since.is_none()
+                {
+                    entry.active_since = Some(std::time::Instant::now());
+                }
+            }
+
+            fn on_exit(
+                &self,
+                id: &tracing::span::Id,
+                _ctx: tracing_subscriber::layer::Context<'_, S>,
+            ) {
+                if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
+                    && let Some(start) = entry.active_since.take()
+                {
+                    entry.accumulated += start.elapsed();
                 }
             }
 
@@ -248,10 +275,15 @@ mod tests {
                 id: tracing::span::Id,
                 _ctx: tracing_subscriber::layer::Context<'_, S>,
             ) {
-                if let Some((name, Some(start))) = self.spans.lock().unwrap().remove(&id.into_u64())
-                {
-                    let ms = start.elapsed().as_secs_f64() * 1000.0;
-                    self.results.lock().unwrap().push((name, ms));
+                if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) {
+                    // If we never saw on_exit (span closed while active), include
+                    // the dangling interval.
+                    let mut total = entry.accumulated;
+                    if let Some(start) = entry.active_since {
+                        total += start.elapsed();
+                    }
+                    let ms = total.as_secs_f64() * 1000.0;
+                    self.results.lock().unwrap().push((entry.name, ms));
                 }
             }
         }
diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs
index cc57a3e5d..971660f37 100644
--- a/bench_vs_plonky3/src/plonky3_config.rs
+++ b/bench_vs_plonky3/src/plonky3_config.rs
@@ -68,16 +68,3 @@ pub fn matched_params_config() -> P3Config {
     let pcs = Pcs::new(dft, val_mmcs, fri_params);
     P3Config::new(pcs, challenger)
 }
-
-/// Creates a Plonky3 STARK config with Plonky3's standard benchmark parameters:
-/// blowup=2, 100 FRI queries, 16-bit query PoW.
-pub fn plonky3_benchmark_config() -> P3Config {
-    let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs();
-    let dft = Dft::default();
-    let challenger = Challenger::from_hasher(vec![], byte_hash);
-
-    let fri_params = FriParameters::new_benchmark(challenge_mmcs);
-
-    let pcs = Pcs::new(dft, val_mmcs, fri_params);
-    P3Config::new(pcs, challenger)
-}

From 4b19d250f80221a0e7b8b0b70bae3285897c8572 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 4 May 2026 12:09:46 -0300
Subject: [PATCH 26/34] Migrate FibonacciPair AIR constraints

---
 bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
index 54c704976..751e86855 100644
--- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
+++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
@@ -25,7 +25,7 @@ use math::field::{
 use stark::{
     constraints::{
         boundary::{BoundaryConstraint, BoundaryConstraints},
-        transition::TransitionConstraint,
+        transition::TransitionConstraintEvaluator,
     },
     context::AirContext,
     proof::options::ProofOptions,
@@ -61,7 +61,7 @@ where
     }
 }
 
-impl<F, E> TransitionConstraint<F, E> for FibPairShiftConstraint<F, E>
+impl<F, E> TransitionConstraintEvaluator<F, E> for FibPairShiftConstraint<F, E>
 where
     F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
     E: IsField + Send + Sync,
@@ -78,7 +78,11 @@ where
         1
     }
 
-    fn evaluate(&self, eval_ctx: &TransitionEvaluationContext<F, E>, out: &mut [FieldElement<E>]) {
+    fn evaluate_verifier(
+        &self,
+        eval_ctx: &TransitionEvaluationContext<F, E>,
+        out: &mut [FieldElement<E>],
+    ) {
         match eval_ctx {
             TransitionEvaluationContext::Prover { frame, .. } => {
                 let s0 = frame.get_evaluation_step(0);
@@ -130,7 +134,7 @@ where
     }
 }
 
-impl<F, E> TransitionConstraint<F, E> for FibPairSumConstraint<F, E>
+impl<F, E> TransitionConstraintEvaluator<F, E> for FibPairSumConstraint<F, E>
 where
     F: IsSubFieldOf<E> + IsFFTField + Send + Sync,
     E: IsField + Send + Sync,
@@ -147,7 +151,11 @@ where
         1
     }
 
-    fn evaluate(&self, eval_ctx: &TransitionEvaluationContext<F, E>, out: &mut [FieldElement<E>]) {
+    fn evaluate_verifier(
+        &self,
+        eval_ctx: &TransitionEvaluationContext<F, E>,
+        out: &mut [FieldElement<E>],
+    ) {
         match eval_ctx {
             TransitionEvaluationContext::Prover { frame, .. } => {
                 let s0 = frame.get_evaluation_step(0);
@@ -184,7 +192,7 @@ where
     E: IsField + Send + Sync,
 {
     context: AirContext,
-    constraints: Vec<Box<dyn TransitionConstraint<F, E>>>,
+    constraints: Vec<Box<dyn TransitionConstraintEvaluator<F, E>>>,
     num_sequences: usize,
 }
 
@@ -209,7 +217,7 @@ where
         trace_length
     }
 
-    fn transition_constraints(&self) -> &Vec<Box<dyn TransitionConstraint<F, E>>> {
+    fn transition_constraints(&self) -> &Vec<Box<dyn TransitionConstraintEvaluator<F, E>>> {
         &self.constraints
     }
 
@@ -251,7 +259,7 @@ where
     E: IsField + Send + Sync + 'static,
 {
     pub fn with_num_sequences(proof_options: &ProofOptions, num_sequences: usize) -> Self {
-        let mut constraints: Vec<Box<dyn TransitionConstraint<F, E>>> =
+        let mut constraints: Vec<Box<dyn TransitionConstraintEvaluator<F, E>>> =
             Vec::with_capacity(2 * num_sequences);
         for seq in 0..num_sequences {
             constraints.push(Box::new(FibPairShiftConstraint::new(seq, 2 * seq)));

From 7ee4cbf01dea951ddd76f85decd02a5c25443185 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 4 May 2026 15:13:10 -0300
Subject: [PATCH 27/34] Add full metrics and phase breakdown to
 bench_vs_plonky3

---
 .github/workflows/bench-vs-p3-nightly.yml     |   2 +-
 bench_vs_plonky3/Cargo.toml                   |   3 +
 bench_vs_plonky3/run.sh                       | 312 +++++++++++--
 bench_vs_plonky3/src/bin/prove_bench.rs       | 414 +++++++++++++++++-
 bench_vs_plonky3/src/lambda_fibonacci_pair.rs |   7 +-
 bench_vs_plonky3/src/plonky3_config.rs        |  28 +-
 6 files changed, 706 insertions(+), 60 deletions(-)

diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml
index d27bd9010..03fedad2b 100644
--- a/.github/workflows/bench-vs-p3-nightly.yml
+++ b/.github/workflows/bench-vs-p3-nightly.yml
@@ -34,7 +34,7 @@ jobs:
           bash ./bench_vs_plonky3/run.sh \
             --log-rows 19 \
             --num-sequences 16 \
-            --runs 3 \
+            --runs 10 \
             --scalar \
             --report-dir bench_vs_p3_artifacts \
             --no-color
diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index 39f8c7330..92deaa31c 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -38,6 +38,9 @@ p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/g
 # Tracing for P3 span-based profiling
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
+libc = "0.2"
+serde = { version = "1.0", features = ["derive"] }
+serde_cbor = "0.11"
 
 [dev-dependencies]
 criterion = { version = "0.4", default-features = false }
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index 1bdedbb5c..0098fed33 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -5,10 +5,10 @@
 # Usage:
 #   ./bench_vs_plonky3/run.sh [--log-rows K ...] [--num-sequences N] [--runs N]
 #                             [--lambda-only | --p3-only] [--report-dir DIR]
-#                             [--scalar] [--no-color]
+#                             [--scalar] [--breakdown] [--no-color]
 #
-# Defaults: --log-rows 19, --num-sequences 16, --runs 3.
-# With multiple --log-rows values, prints one median row per size.
+# Defaults: --log-rows 19, --num-sequences 16, --runs 10.
+# With multiple --log-rows values, prints one stats row per size.
 #
 # --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks (and most of Keccak)
 # run scalar; residual SSE2 in p3-keccak remains. Triggers a rebuild when
@@ -23,6 +23,7 @@ trap 'rm -rf "$TMP_DIR"' EXIT
 REPORT_DIR=""
 NO_COLOR=false
 SCALAR=false
+BREAKDOWN=false
 
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@@ -32,7 +33,10 @@ NC='\033[0m'
 
 LOG_ROWS=()
 NUM_SEQUENCES=16
-RUNS=3
+RUNS=10
+BLOWUP=2
+FRI_QUERIES=219
+GRINDING=0
 RUN_LAMBDA=true
 RUN_P3=true
 
@@ -73,6 +77,10 @@ while [[ $# -gt 0 ]]; do
             SCALAR=true
             shift
             ;;
+        --breakdown)
+            BREAKDOWN=true
+            shift
+            ;;
         --no-color)
             NO_COLOR=true
             shift
@@ -146,8 +154,14 @@ fi
 echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}"
 echo -e "  log-rows:       ${YELLOW}${LOG_ROWS[*]}${NC}"
 echo -e "  num-sequences:  ${YELLOW}${NUM_SEQUENCES}${NC}  (columns = $((2 * NUM_SEQUENCES)))"
-echo -e "  runs/size:      ${YELLOW}${RUNS}${NC}  (median reported)"
+echo -e "  runs/size:      ${YELLOW}${RUNS}${NC}  (median + CV reported)"
 echo -e "  p3 extension:   ${YELLOW}degree 3 (forked p3-goldilocks, matches Lambda)${NC}"
+echo -e "  proof params:   ${YELLOW}blowup=${BLOWUP}, queries=${FRI_QUERIES}, grinding=${GRINDING}${NC}"
+if $BREAKDOWN; then
+    echo -e "  breakdown:      ${YELLOW}on${NC}  (Lambda instruments + P3 tracing spans)"
+else
+    echo -e "  breakdown:      ${YELLOW}off${NC}"
+fi
 if $SCALAR_ACTIVE; then
     echo -e "  scalar mode:    ${YELLOW}on${NC}  (arch=$(uname -m), RUSTFLAGS=\"${RUSTFLAGS:-}\")"
 elif $SCALAR; then
@@ -158,8 +172,11 @@ fi
 echo ""
 
 echo -e "${GREEN}[build]${NC} prove_bench"
-cargo build --release -p bench-vs-plonky3 --bin prove_bench \
-    --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -5
+BUILD_ARGS=(build --release -p bench-vs-plonky3 --bin prove_bench --manifest-path "$ROOT_DIR/Cargo.toml")
+if $BREAKDOWN; then
+    BUILD_ARGS+=(--features instruments)
+fi
+cargo "${BUILD_ARGS[@]}" 2>&1 | tail -5
 
 # Resolve the actual target directory via cargo metadata so we find the binary
 # whether cargo used ./target/ (default) or a custom CARGO_TARGET_DIR.
@@ -182,17 +199,30 @@ extract_proving_time() {
     }'
 }
 
+extract_metrics_line() {
+    sed -n '/^METRICS	/ {
+        p
+        q
+    }'
+}
+
+metric_value() {
+    local line=$1
+    local key=$2
+    printf '%s\n' "$line" | tr '\t' '\n' | LC_ALL=C awk -F= -v key="$key" '$1 == key { print $2; exit }'
+}
+
 median_of() {
-    # prints median of the given numeric arguments (rounded to 3 decimals).
+    # prints median of the given numeric arguments.
     # Uses shell `sort -g` for portability (macOS awk lacks gawk's asort).
     printf '%s\n' "$@" | LC_ALL=C sort -g | LC_NUMERIC=C awk '
         { a[NR] = $0 + 0 }
         END {
             if (NR == 0) { print "n/a"; exit }
             if (NR % 2 == 1) {
-                printf "%.3f\n", a[(NR + 1) / 2]
+                printf "%.6f\n", a[(NR + 1) / 2]
             } else {
-                printf "%.3f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2
+                printf "%.6f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2
             }
         }'
 }
@@ -204,6 +234,85 @@ ratio_fmt() {
     }'
 }
 
+mean_file() {
+    LC_NUMERIC=C awk '{ s += $1; n++ } END { if (n == 0) print "n/a"; else printf "%.6f\n", s / n }' "$1"
+}
+
+median_file() {
+    LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk '
+        { a[NR] = $0 + 0 }
+        END {
+            if (NR == 0) { print "n/a"; exit }
+            if (NR % 2 == 1) printf "%.6f\n", a[(NR + 1) / 2]
+            else printf "%.6f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2
+        }'
+}
+
+stddev_file() {
+    LC_NUMERIC=C awk '
+        { s += $1; ss += $1 * $1; n++ }
+        END {
+            if (n == 0) { print "n/a"; exit }
+            m = s / n
+            v = (ss / n) - (m * m)
+            if (v < 0) v = 0
+            printf "%.6f\n", sqrt(v)
+        }' "$1"
+}
+
+cv_pct_file() {
+    LC_NUMERIC=C awk '
+        { s += $1; ss += $1 * $1; n++ }
+        END {
+            if (n == 0) { print "n/a"; exit }
+            m = s / n
+            v = (ss / n) - (m * m)
+            if (v < 0) v = 0
+            sd = sqrt(v)
+            if (m == 0) print "n/a"
+            else printf "%.2f\n", sd * 100 / m
+        }' "$1"
+}
+
+min_file() {  # Print the smallest numeric line of file $1 (6 decimals), or "n/a" when the file is empty, like max_file.
+    LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk 'NR == 1 { printf "%.6f\n", $1; seen = 1; exit } END { if (!seen) print "n/a" }'
+}
+
+max_file() {
+    LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk '{ x = $1 } END { if (NR == 0) print "n/a"; else printf "%.6f\n", x }'
+}
+
+fmt0() {
+    LC_NUMERIC=C awk -v v="$1" 'BEGIN { if (v == "n/a") print v; else printf "%.0f\n", v }'
+}
+
+metric_file_for() {
+    local metrics_file=$1
+    local key=$2
+    local out_file=$3
+    : > "$out_file"
+    while IFS= read -r line; do
+        local value
+        value=$(metric_value "$line" "$key")
+        if [ -n "$value" ] && [ "$value" != "n/a" ]; then
+            printf '%s\n' "$value" >> "$out_file"
+        fi
+    done < "$metrics_file"
+}
+
+median_metric() {
+    local prover=$1
+    local log_rows=$2
+    local key=$3
+    local file="$TMP_DIR/${prover}_${log_rows}_${key}.values"
+    metric_file_for "$TMP_DIR/${prover}_${log_rows}.metrics" "$key" "$file"
+    if [ ! -s "$file" ]; then
+        printf "n/a\n"
+    else
+        median_file "$file"
+    fi
+}
+
 # --- Run benchmark ----------------------------------------------------------
 
 RESULT_LOG_ROWS=()
@@ -211,34 +320,58 @@ RESULT_ROWS=()
 RESULT_LAMBDA=()
 RESULT_P3=()
 RESULT_RATIO=()
+RESULT_LAMBDA_CV=()
+RESULT_P3_CV=()
+RESULT_LAMBDA_VERIFY=()
+RESULT_P3_VERIFY=()
+RESULT_LAMBDA_PROOF_SIZE=()
+RESULT_P3_PROOF_SIZE=()
+RESULT_LAMBDA_RSS=()
+RESULT_P3_RSS=()
 
 run_prover() {
     local prover=$1   # lambda | p3
     local log_rows=$2
     local times=()
+    local metrics_file="$TMP_DIR/${prover}_${log_rows}.metrics"
+    local breakdown_file="$TMP_DIR/${prover}_${log_rows}.breakdown"
+    : > "$metrics_file"
+    : > "$breakdown_file"
     for run_i in $(seq 1 "$RUNS"); do
         local out_file="$TMP_DIR/${prover}_${log_rows}_${run_i}.stdout"
-        if ! "$BIN" --prover "$prover" \
-                --log-rows "$log_rows" \
-                --num-sequences "$NUM_SEQUENCES" > "$out_file" 2>&1; then
+        local run_args=(--prover "$prover" --log-rows "$log_rows" --num-sequences "$NUM_SEQUENCES" --blowup "$BLOWUP" --queries "$FRI_QUERIES" --grinding "$GRINDING")
+        if $BREAKDOWN; then
+            run_args+=(--breakdown)
+        fi
+        if ! "$BIN" "${run_args[@]}" > "$out_file" 2>&1; then
             echo -e "  ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}"
             cat "$out_file"
             exit 1
         fi
-        local t
-        t=$(extract_proving_time < "$out_file")
-        if [ -z "$t" ]; then
-            echo -e "  ${RED}[${prover}] could not parse proving time (log-rows=${log_rows}, run ${run_i})${NC}"
+        local metrics_line
+        metrics_line=$(extract_metrics_line < "$out_file")
+        if [ -z "$metrics_line" ]; then
+            echo -e "  ${RED}[${prover}] could not parse metrics (log-rows=${log_rows}, run ${run_i})${NC}"
             cat "$out_file"
             exit 1
         fi
+        printf '%s\n' "$metrics_line" >> "$metrics_file"
+        if $BREAKDOWN; then
+            sed -n "s/^BREAKDOWN	/BREAKDOWN	run=${run_i}	/p" "$out_file" >> "$breakdown_file"
+        fi
+
+        local t
+        t=$(metric_value "$metrics_line" prove_s)
+        if [ -z "$t" ]; then
+            t=$(extract_proving_time < "$out_file")
+        fi
         times+=("$t")
         if [ -n "$REPORT_DIR" ]; then
             cp "$out_file" "$REPORT_DIR/raw/${prover}_log${log_rows}_run${run_i}.stdout"
         fi
     done
-    median_of "${times[@]}"
     printf '%s\n' "${times[@]}" > "$TMP_DIR/${prover}_${log_rows}.times"
+    median_of "${times[@]}"
 }
 
 for lr in "${LOG_ROWS[@]}"; do
@@ -247,17 +380,33 @@ for lr in "${LOG_ROWS[@]}"; do
 
     lambda_median="n/a"
     p3_median="n/a"
+    lambda_cv="n/a"
+    p3_cv="n/a"
+    lambda_verify="n/a"
+    p3_verify="n/a"
+    lambda_proof_size="n/a"
+    p3_proof_size="n/a"
+    lambda_rss="n/a"
+    p3_rss="n/a"
 
     if $RUN_LAMBDA; then
         echo -ne "  ${GREEN}[lambda]${NC} "
         lambda_median=$(run_prover lambda "$lr")
-        echo -e "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")"
+        lambda_cv=$(cv_pct_file "$TMP_DIR/lambda_${lr}.times")
+        lambda_verify=$(median_metric lambda "$lr" verify_s)
+        lambda_proof_size=$(median_metric lambda "$lr" proof_size_bytes)
+        lambda_rss=$(median_metric lambda "$lr" peak_rss_kb)
+        echo -e "prove median ${BOLD}${lambda_median}s${NC} (CV ${lambda_cv}%), verify ${lambda_verify}s, proof $(fmt0 "$lambda_proof_size") B, rss $(fmt0 "$lambda_rss") KB"
     fi
 
     if $RUN_P3; then
         echo -ne "  ${GREEN}[p3]${NC}     "
         p3_median=$(run_prover p3 "$lr")
-        echo -e "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")"
+        p3_cv=$(cv_pct_file "$TMP_DIR/p3_${lr}.times")
+        p3_verify=$(median_metric p3 "$lr" verify_s)
+        p3_proof_size=$(median_metric p3 "$lr" proof_size_bytes)
+        p3_rss=$(median_metric p3 "$lr" peak_rss_kb)
+        echo -e "prove median ${BOLD}${p3_median}s${NC} (CV ${p3_cv}%), verify ${p3_verify}s, proof $(fmt0 "$p3_proof_size") B, rss $(fmt0 "$p3_rss") KB"
     fi
 
     local_ratio="n/a"
@@ -270,6 +419,14 @@ for lr in "${LOG_ROWS[@]}"; do
     RESULT_LAMBDA+=("$lambda_median")
     RESULT_P3+=("$p3_median")
     RESULT_RATIO+=("$local_ratio")
+    RESULT_LAMBDA_CV+=("$lambda_cv")
+    RESULT_P3_CV+=("$p3_cv")
+    RESULT_LAMBDA_VERIFY+=("$lambda_verify")
+    RESULT_P3_VERIFY+=("$p3_verify")
+    RESULT_LAMBDA_PROOF_SIZE+=("$lambda_proof_size")
+    RESULT_P3_PROOF_SIZE+=("$p3_proof_size")
+    RESULT_LAMBDA_RSS+=("$lambda_rss")
+    RESULT_P3_RSS+=("$p3_rss")
 done
 
 # --- Summary table ----------------------------------------------------------
@@ -277,11 +434,11 @@ done
 echo ""
 echo -e "${BOLD}=== Summary ===${NC}"
 if $RUN_LAMBDA && $RUN_P3; then
-    printf "  %-9s  %-12s  %14s  %14s  %10s\n" "log-rows" "rows" "Lambda (s)" "P3 (s)" "L/P3"
-    printf "  %-9s  %-12s  %14s  %14s  %10s\n" "--------" "----" "----------" "------" "----"
+    printf "  %-9s  %-12s  %14s  %9s  %14s  %9s  %10s\n" "log-rows" "rows" "Lambda (s)" "L CV%" "P3 (s)" "P3 CV%" "L/P3"
+    printf "  %-9s  %-12s  %14s  %9s  %14s  %9s  %10s\n" "--------" "----" "----------" "-----" "------" "------" "----"
 else
-    printf "  %-9s  %-12s  %14s\n" "log-rows" "rows" "Time (s)"
-    printf "  %-9s  %-12s  %14s\n" "--------" "----" "--------"
+    printf "  %-9s  %-12s  %14s  %9s\n" "log-rows" "rows" "Time (s)" "CV%"
+    printf "  %-9s  %-12s  %14s  %9s\n" "--------" "----" "--------" "---"
 fi
 
 for i in "${!RESULT_LOG_ROWS[@]}"; do
@@ -290,6 +447,8 @@ for i in "${!RESULT_LOG_ROWS[@]}"; do
     lt="${RESULT_LAMBDA[$i]}"
     pt="${RESULT_P3[$i]}"
     rt="${RESULT_RATIO[$i]}"
+    lcv="${RESULT_LAMBDA_CV[$i]}"
+    pcv="${RESULT_P3_CV[$i]}"
     if $RUN_LAMBDA && $RUN_P3; then
         color=$GREEN
         verdict="Lambda faster"
@@ -297,18 +456,18 @@ for i in "${!RESULT_LOG_ROWS[@]}"; do
             color=$RED
             verdict="P3 faster"
         fi
-        printf "  %-9s  %-12s  %13ss  %13ss  ${color}%9sx${NC}  (${color}%s${NC})\n" \
-            "$lr" "$rows" "$lt" "$pt" "$rt" "$verdict"
+        printf "  %-9s  %-12s  %13ss  %8s%%  %13ss  %8s%%  ${color}%9sx${NC}  (${color}%s${NC})\n" \
+            "$lr" "$rows" "$lt" "$lcv" "$pt" "$pcv" "$rt" "$verdict"
     elif $RUN_LAMBDA; then
-        printf "  %-9s  %-12s  %13ss\n" "$lr" "$rows" "$lt"
+        printf "  %-9s  %-12s  %13ss  %8s%%\n" "$lr" "$rows" "$lt" "$lcv"
     else
-        printf "  %-9s  %-12s  %13ss\n" "$lr" "$rows" "$pt"
+        printf "  %-9s  %-12s  %13ss  %8s%%\n" "$lr" "$rows" "$pt" "$pcv"
     fi
 done
 
 echo ""
 if $RUN_LAMBDA && $RUN_P3; then
-    echo -e "Timing window: single-shot end-to-end prove."
+    echo -e "Timing window: prove only for the ratio. Verify, proof size, RSS and throughput are reported separately."
 fi
 
 # --- Machine-readable report ------------------------------------------------
@@ -325,18 +484,86 @@ if [ -n "$REPORT_DIR" ]; then
     }
 
     {
-        printf "log_rows\trows\tlambda_median_s\tp3_median_s\tratio_lambda_over_p3\truns\n"
+        printf "log_rows\trows\tlambda_prove_median_s\tlambda_prove_cv_pct\tlambda_verify_median_s\tlambda_proof_size_bytes_median\tlambda_peak_rss_kb_median\tp3_prove_median_s\tp3_prove_cv_pct\tp3_verify_median_s\tp3_proof_size_bytes_median\tp3_peak_rss_kb_median\tratio_lambda_over_p3\truns\n"
         for i in "${!RESULT_LOG_ROWS[@]}"; do
-            printf "%s\t%s\t%s\t%s\t%s\t%s\n" \
+            printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
                 "${RESULT_LOG_ROWS[$i]}" \
                 "${RESULT_ROWS[$i]}" \
                 "${RESULT_LAMBDA[$i]}" \
+                "${RESULT_LAMBDA_CV[$i]}" \
+                "${RESULT_LAMBDA_VERIFY[$i]}" \
+                "${RESULT_LAMBDA_PROOF_SIZE[$i]}" \
+                "${RESULT_LAMBDA_RSS[$i]}" \
                 "${RESULT_P3[$i]}" \
+                "${RESULT_P3_CV[$i]}" \
+                "${RESULT_P3_VERIFY[$i]}" \
+                "${RESULT_P3_PROOF_SIZE[$i]}" \
+                "${RESULT_P3_RSS[$i]}" \
                 "${RESULT_RATIO[$i]}" \
                 "$RUNS"
         done
     } > "$REPORT_DIR/results.tsv"
 
+    {
+        printf "workload\tprover\tlog_rows\trows\tnum_sequences\tmain_cols\taux_cols\ttables\tlogup\tblowup\tfri_queries\tgrinding\tprove_s\tverify_s\tproof_size_bytes\tpeak_rss_kb\trows_per_sec\tcells_per_sec\n"
+        for lr in "${RESULT_LOG_ROWS[@]}"; do
+            for prover in lambda p3; do
+                metrics_file="$TMP_DIR/${prover}_${lr}.metrics"
+                if [ ! -f "$metrics_file" ]; then
+                    continue
+                fi
+                while IFS= read -r line; do
+                    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+                        "$(metric_value "$line" workload)" \
+                        "$(metric_value "$line" prover)" \
+                        "$(metric_value "$line" log_rows)" \
+                        "$(metric_value "$line" rows)" \
+                        "$(metric_value "$line" num_sequences)" \
+                        "$(metric_value "$line" main_cols)" \
+                        "$(metric_value "$line" aux_cols)" \
+                        "$(metric_value "$line" tables)" \
+                        "$(metric_value "$line" logup)" \
+                        "$(metric_value "$line" blowup)" \
+                        "$(metric_value "$line" fri_queries)" \
+                        "$(metric_value "$line" grinding)" \
+                        "$(metric_value "$line" prove_s)" \
+                        "$(metric_value "$line" verify_s)" \
+                        "$(metric_value "$line" proof_size_bytes)" \
+                        "$(metric_value "$line" peak_rss_kb)" \
+                        "$(metric_value "$line" rows_per_sec)" \
+                        "$(metric_value "$line" cells_per_sec)"
+                done < "$metrics_file"
+            done
+        done
+    } > "$REPORT_DIR/raw_metrics.tsv"
+
+    if $BREAKDOWN; then
+        {
+            printf "run\tworkload\tprover\tlog_rows\trows\tphase\tms\ttable\ttable_rows\tspan\n"
+            for lr in "${RESULT_LOG_ROWS[@]}"; do
+                for prover in lambda p3; do
+                    breakdown_file="$TMP_DIR/${prover}_${lr}.breakdown"
+                    if [ ! -f "$breakdown_file" ]; then
+                        continue
+                    fi
+                    while IFS= read -r line; do
+                        printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+                            "$(metric_value "$line" run)" \
+                            "$(metric_value "$line" workload)" \
+                            "$(metric_value "$line" prover)" \
+                            "$(metric_value "$line" log_rows)" \
+                            "$(metric_value "$line" rows)" \
+                            "$(metric_value "$line" phase)" \
+                            "$(metric_value "$line" ms)" \
+                            "$(metric_value "$line" table)" \
+                            "$(metric_value "$line" table_rows)" \
+                            "$(metric_value "$line" span)"
+                    done < "$breakdown_file"
+                done
+            done
+        } > "$REPORT_DIR/breakdown.tsv"
+    fi
+
     # Capture commit + timestamp so the artifact is self-describing.
     git_sha="$(git -C "$ROOT_DIR" rev-parse HEAD 2>/dev/null || echo unknown)"
     git_dirty="clean"
@@ -352,10 +579,15 @@ if [ -n "$REPORT_DIR" ]; then
         echo "arch=$(uname -m)"
         echo "num_sequences=$NUM_SEQUENCES"
         echo "columns=$((2 * NUM_SEQUENCES))"
-        echo "blowup=2"
-        echo "fri_queries=219"
-        echo "grinding=0"
+        echo "blowup=$BLOWUP"
+        echo "fri_queries=$FRI_QUERIES"
+        echo "grinding=$GRINDING"
         echo "runs_per_size=$RUNS"
+        if $BREAKDOWN; then
+            echo "breakdown=on"
+        else
+            echo "breakdown=off"
+        fi
         echo "p3_extension=degree3_fork"
         if $SCALAR_ACTIVE; then
             echo "scalar=on"
@@ -365,11 +597,17 @@ if [ -n "$REPORT_DIR" ]; then
         else
             echo "scalar=off"
         fi
-        echo "timing_window=single_shot_end_to_end_prove_no_verify"
+        echo "timing_window=prove_only_ratio_verify_size_rss_reported_separately"
         echo "log_rows_series=$(join_slash "${RESULT_LOG_ROWS[@]}")"
         echo "rows_series=$(join_slash "${RESULT_ROWS[@]}")"
-        echo "lambda_medians=$(join_slash "${RESULT_LAMBDA[@]}")"
-        echo "p3_medians=$(join_slash "${RESULT_P3[@]}")"
+        echo "lambda_prove_medians=$(join_slash "${RESULT_LAMBDA[@]}")"
+        echo "p3_prove_medians=$(join_slash "${RESULT_P3[@]}")"
+        echo "lambda_verify_medians=$(join_slash "${RESULT_LAMBDA_VERIFY[@]}")"
+        echo "p3_verify_medians=$(join_slash "${RESULT_P3_VERIFY[@]}")"
+        echo "lambda_proof_size_medians=$(join_slash "${RESULT_LAMBDA_PROOF_SIZE[@]}")"
+        echo "p3_proof_size_medians=$(join_slash "${RESULT_P3_PROOF_SIZE[@]}")"
+        echo "lambda_peak_rss_medians=$(join_slash "${RESULT_LAMBDA_RSS[@]}")"
+        echo "p3_peak_rss_medians=$(join_slash "${RESULT_P3_RSS[@]}")"
         echo "ratios_lambda_over_p3=$(join_slash "${RESULT_RATIO[@]}")"
     } > "$REPORT_DIR/metrics.txt"
 fi
diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs
index cb58aea42..66d9baacd 100644
--- a/bench_vs_plonky3/src/bin/prove_bench.rs
+++ b/bench_vs_plonky3/src/bin/prove_bench.rs
@@ -1,17 +1,18 @@
 //! Minimal wall-clock benchmark harness for Lambda STARK vs Plonky3.
 //!
 //! Builds the same Fibonacci AIR as `instruments_breakdown` (but without any
-//! instrumentation) and prints a single line `Proving time: X.XXXs` to
-//! stdout, suitable for parsing by `bench_vs_plonky3/run.sh`.
+//! instrumentation) and prints human-readable timings plus one tab-separated
+//! `METRICS` line, suitable for parsing by `bench_vs_plonky3/run.sh`.
 //!
 //! Usage:
 //!   prove_bench --prover {lambda|p3} [--log-rows K] [--num-sequences N]
-//!               [--blowup B] [--queries Q] [--grinding G]
+//!               [--blowup B] [--queries Q] [--grinding G] [--breakdown]
 //!
 //! Defaults match production (`GoldilocksCubicProofOptions::with_blowup(2)`):
 //!   log-rows=19, num-sequences=16, blowup=2, queries=219, grinding=0.
 
 use std::process::ExitCode;
+use std::sync::{Arc, Mutex};
 use std::time::Instant;
 
 use bench_vs_plonky3::{lambda_fibonacci_pair, plonky3_config, plonky3_fibonacci};
@@ -21,6 +22,8 @@ use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
 use math::field::goldilocks::GoldilocksField;
 use stark::proof::options::ProofOptions;
 use stark::prover::{IsStarkProver, Prover};
+use stark::verifier::{IsStarkVerifier, Verifier};
+use tracing_subscriber::layer::SubscriberExt;
 
 type F = GoldilocksField;
 type E = Degree3GoldilocksExtensionField;
@@ -39,6 +42,14 @@ struct Args {
     blowup: u8,
     queries: usize,
     grinding: u8,
+    breakdown: bool,
+}
+
+struct BenchMetrics {
+    prove_s: f64, // wall-clock proving time, in seconds
+    verify_s: f64, // wall-clock verification time, in seconds
+    proof_size_bytes: usize, // length of the CBOR-serialized proof
+    peak_rss_kb: Option<u64>, // `None` when `getrusage` fails
+}
 
 impl Default for Args {
@@ -50,6 +61,7 @@ impl Default for Args {
             blowup: 2,
             queries: 219,
             grinding: 0,
+            breakdown: false,
         }
     }
 }
@@ -58,7 +70,7 @@ fn print_usage() {
     eprintln!(
         "usage: prove_bench --prover {{lambda|p3}} \
          [--log-rows K] [--num-sequences N] \
-         [--blowup B] [--queries Q] [--grinding G]"
+         [--blowup B] [--queries Q] [--grinding G] [--breakdown]"
     );
 }
 
@@ -97,6 +109,9 @@ fn parse_args() -> Result<Args, String> {
                 let v = iter.next().ok_or("--grinding needs a value")?;
                 args.grinding = v.parse().map_err(|_| "--grinding: invalid u8")?;
             }
+            "--breakdown" => {
+                args.breakdown = true;
+            }
             "-h" | "--help" => {
                 print_usage();
                 std::process::exit(0);
@@ -113,6 +128,12 @@ fn parse_args() -> Result<Args, String> {
     if args.num_sequences == 0 {
         return Err("--num-sequences must be > 0".into());
     }
+    if !args.blowup.is_power_of_two() {
+        return Err("--blowup must be a power of two".into());
+    }
+    if args.queries == 0 {
+        return Err("--queries must be > 0".into());
+    }
     Ok(args)
 }
 
@@ -125,7 +146,282 @@ fn proof_options(args: &Args) -> ProofOptions {
     }
 }
 
-fn run_lambda(args: &Args) -> std::time::Duration {
+fn ms(seconds: f64) -> f64 {
+    seconds * 1000.0
+}
+
+fn print_breakdown(
+    prover: &str,
+    log_rows: u32,
+    rows: usize,
+    phase: &str,
+    elapsed_ms: f64,
+    extra: &str, // appended verbatim after `ms=`; callers pass "" or "\tkey=value" pairs
+) { // prints one tab-separated `BREAKDOWN` record to stdout
+    println!(
+        "BREAKDOWN\tworkload=fib_pair\tprover={prover}\tlog_rows={log_rows}\trows={rows}\tphase={phase}\tms={elapsed_ms:.3}{extra}"
+    );
+}
+
+#[cfg(feature = "instruments")]
+fn emit_lambda_breakdown(args: &Args, rows: usize, total_ms: f64) {
+    print_breakdown("lambda", args.log_rows, rows, "prove_total", total_ms, "");
+
+    if let Some(timing) = stark::instruments::take() {
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "prepass",
+            ms(timing.prepass.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "main_commits",
+            ms(timing.main_commits.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "aux_build",
+            ms(timing.aux_build.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "aux_commit",
+            ms(timing.aux_commit.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "rounds_2_4",
+            ms(timing.rounds_2_4.as_secs_f64()),
+            "",
+        );
+
+        let r1 = timing.round1_sub;
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "r1_main_lde",
+            ms(r1.main_lde.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "r1_main_merkle",
+            ms(r1.main_merkle.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "r1_aux_lde",
+            ms(r1.aux_lde.as_secs_f64()),
+            "",
+        );
+        print_breakdown(
+            "lambda",
+            args.log_rows,
+            rows,
+            "r1_aux_merkle",
+            ms(r1.aux_merkle.as_secs_f64()),
+            "",
+        );
+
+        for (name, table_rows, dur, sub) in timing.table_timings {
+            let extra = format!("\ttable={name}\ttable_rows={table_rows}");
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "table_total",
+                ms(dur.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r2_constraints",
+                ms(sub.constraints.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r2_comp_decompose",
+                ms(sub.comp_decompose.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r2_comp_commit",
+                ms(sub.comp_commit.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r3_ood",
+                ms(sub.ood.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r4_deep_comp",
+                ms(sub.deep_comp.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r4_deep_extend",
+                ms(sub.deep_extend.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r4_fri_commit",
+                ms(sub.fri_commit.as_secs_f64()),
+                &extra,
+            );
+            print_breakdown(
+                "lambda",
+                args.log_rows,
+                rows,
+                "r4_queries",
+                ms(sub.queries.as_secs_f64()),
+                &extra,
+            );
+        }
+    }
+}
+
+#[cfg(not(feature = "instruments"))]
+fn emit_lambda_breakdown(args: &Args, rows: usize, total_ms: f64) {
+    print_breakdown("lambda", args.log_rows, rows, "prove_total", total_ms, "");
+    eprintln!("warning: Lambda phase breakdown requires building with --features instruments");
+}
+
+struct SpanState {
+    name: String, // span name taken from its metadata when the span is created
+    active_since: Option<Instant>, // `Some` while the span is entered, `None` otherwise
+    accumulated: std::time::Duration, // sum of all completed enter/exit intervals
+}
+
+struct P3TimingLayer {
+    spans: Mutex<std::collections::HashMap<u64, SpanState>>, // live spans keyed by `Id::into_u64()`
+    results: Arc<Mutex<Vec<(String, f64)>>>, // (span name, total ms) pushed when a span closes
+}
+
+impl<S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>>
+    tracing_subscriber::Layer<S> for P3TimingLayer
+{
+    fn on_new_span(
+        &self,
+        attrs: &tracing::span::Attributes<'_>,
+        id: &tracing::span::Id,
+        _ctx: tracing_subscriber::layer::Context<'_, S>,
+    ) { // registers the span with a stopped timer and zero accumulated time
+        self.spans.lock().unwrap().insert(
+            id.into_u64(),
+            SpanState {
+                name: attrs.metadata().name().to_string(),
+                active_since: None, // the timer only starts in `on_enter`
+                accumulated: std::time::Duration::ZERO,
+            },
+        );
+    }
+
+    fn on_enter(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
+        if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
+            && entry.active_since.is_none() // already running: keep the earlier start time
+        {
+            entry.active_since = Some(Instant::now());
+        }
+    }
+
+    fn on_exit(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
+        if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
+            && let Some(start) = entry.active_since.take() // stops the timer
+        {
+            entry.accumulated += start.elapsed(); // add the interval since the recorded enter
+        }
+    }
+
+    fn on_close(&self, id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
+        if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) {
+            let mut total = entry.accumulated;
+            if let Some(start) = entry.active_since { // closed while still entered
+                total += start.elapsed();
+            }
+            self.results
+                .lock()
+                .unwrap()
+                .push((entry.name, ms(total.as_secs_f64()))); // stored in milliseconds
+        }
+    }
+}
+
+type P3SpanResults = Arc<Mutex<Vec<(String, f64)>>>;
+
+fn p3_span_subscriber() -> (impl tracing::Subscriber + Send + Sync, P3SpanResults) {
+    let results = Arc::new(Mutex::new(Vec::new())); // shared with the layer; read by the caller
+    let layer = P3TimingLayer {
+        spans: Mutex::new(std::collections::HashMap::new()),
+        results: Arc::clone(&results),
+    };
+    let filter = tracing_subscriber::filter::LevelFilter::DEBUG; // TRACE-level spans are not timed
+    (
+        tracing_subscriber::registry().with(filter).with(layer),
+        results, // second handle to the buffer the layer pushes into
+    )
+}
+
+/// Returns the process's peak resident set size in KB, or `None` when
+/// `getrusage` fails or reports a negative value.
+fn peak_rss_kb() -> Option<u64> {
+    let mut usage = std::mem::MaybeUninit::<libc::rusage>::uninit();
+    // SAFETY: `usage.as_mut_ptr()` points to writable storage for one `rusage`.
+    let rc = unsafe { libc::getrusage(libc::RUSAGE_SELF, usage.as_mut_ptr()) };
+    if rc != 0 {
+        return None;
+    }
+
+    // SAFETY: getrusage returned 0, so it initialized `usage`.
+    let maxrss = unsafe { usage.assume_init().ru_maxrss };
+    // `ru_maxrss` is a signed C long: reject negatives instead of wrapping via `as`.
+    let maxrss = u64::try_from(maxrss).ok()?;
+    // macOS reports bytes (rounded up to KB here); other targets pass through as KB.
+    let units_per_kb: u64 = if cfg!(target_os = "macos") { 1024 } else { 1 };
+    Some(maxrss.div_ceil(units_per_kb))
+}
+
+fn run_lambda(args: &Args) -> BenchMetrics {
     let rows = 1usize << args.log_rows;
     let options = proof_options(args);
 
@@ -148,21 +444,80 @@ fn run_lambda(args: &Args) -> std::time::Duration {
         &mut DefaultTranscript::<E>::new(&[]),
     )
     .expect("lambda prove failed");
-    start.elapsed()
+    let prove_s = start.elapsed().as_secs_f64();
+    if args.breakdown {
+        emit_lambda_breakdown(args, rows, ms(prove_s));
+    }
+
+    let proof_size_bytes = serde_cbor::to_vec(&_proof)
+        .expect("lambda proof serialization failed")
+        .len();
+
+    let start = Instant::now();
+    let verified =
+        Verifier::<F, E, _>::verify(&_proof, &air, &mut DefaultTranscript::<E>::new(&[]));
+    let verify_s = start.elapsed().as_secs_f64();
+    assert!(verified, "lambda verify failed");
+
+    BenchMetrics {
+        prove_s,
+        verify_s,
+        proof_size_bytes,
+        peak_rss_kb: peak_rss_kb(),
+    }
 }
 
-fn run_p3(args: &Args) -> std::time::Duration {
+/// Proves then verifies the Plonky3 Fibonacci AIR and returns the run's [`BenchMetrics`].
+fn run_p3(args: &Args) -> BenchMetrics {
     let rows = 1usize << args.log_rows;
-    let config = plonky3_config::matched_params_config();
+    let config = plonky3_config::params_config(args.blowup, args.queries, args.grinding);
     let air = plonky3_fibonacci::P3FibonacciAir {
         num_sequences: args.num_sequences,
     };
     let trace = plonky3_fibonacci::generate_fibonacci_trace(args.num_sequences, rows);
     let pis = plonky3_fibonacci::public_values(args.num_sequences);
 
+    let (prove_s, _proof, span_results) = if args.breakdown {
+        let (subscriber, results) = p3_span_subscriber();
+        let start = Instant::now();
+        let proof = {
+            let _guard = tracing::subscriber::set_default(subscriber);
+            p3_uni_stark::prove(&config, &air, trace, &pis)
+        };
+        (start.elapsed().as_secs_f64(), proof, Some(results))
+    } else {
+        let start = Instant::now();
+        let proof = p3_uni_stark::prove(&config, &air, trace, &pis);
+        (start.elapsed().as_secs_f64(), proof, None)
+    };
+
+    // `span_results` is `Some` only when `--breakdown` installed the span subscriber.
+    if let Some(results) = span_results {
+        print_breakdown("p3", args.log_rows, rows, "prove_total", ms(prove_s), "");
+        // Take the spans without cloning; slowest first (`total_cmp` cannot panic).
+        let mut span_data = std::mem::take(&mut *results.lock().unwrap());
+        span_data.sort_by(|a, b| b.1.total_cmp(&a.1));
+        // Spans shorter than 0.1 ms are not reported.
+        for (name, elapsed_ms) in span_data.into_iter().filter(|(_, t)| *t >= 0.1) {
+            let extra = format!("\tspan={name}");
+            print_breakdown("p3", args.log_rows, rows, "span", elapsed_ms, &extra);
+        }
+    }
+
+    let proof_size_bytes = serde_cbor::to_vec(&_proof)
+        .expect("p3 proof serialization failed")
+        .len();
+
     let start = Instant::now();
-    let _proof = p3_uni_stark::prove(&config, &air, trace, &pis);
-    start.elapsed()
+    p3_uni_stark::verify(&config, &air, &_proof, &pis).expect("p3 verify failed");
+    let verify_s = start.elapsed().as_secs_f64();
+
+    BenchMetrics {
+        prove_s,
+        verify_s,
+        proof_size_bytes,
+        peak_rss_kb: peak_rss_kb(),
+    }
 }
 
 fn main() -> ExitCode {
@@ -175,11 +530,46 @@ fn main() -> ExitCode {
         }
     };
 
-    let elapsed = match args.prover {
+    let metrics = match args.prover {
         ProverKind::Lambda => run_lambda(&args),
         ProverKind::P3 => run_p3(&args),
     };
 
-    println!("Proving time: {:.3}s", elapsed.as_secs_f64());
+    let prover_name = match args.prover {
+        ProverKind::Lambda => "lambda",
+        ProverKind::P3 => "p3",
+    };
+    let rows = 1usize << args.log_rows;
+    let main_cols = 2 * args.num_sequences;
+    let aux_cols = 0usize;
+    let cells = rows * main_cols;
+    let rows_per_sec = rows as f64 / metrics.prove_s;
+    let cells_per_sec = cells as f64 / metrics.prove_s;
+    let peak_rss_kb = metrics
+        .peak_rss_kb
+        .map(|v| v.to_string())
+        .unwrap_or_else(|| "n/a".to_string());
+
+    println!("Proving time: {:.6}s", metrics.prove_s);
+    println!("Verification time: {:.6}s", metrics.verify_s);
+    println!("Proof size: {} bytes", metrics.proof_size_bytes);
+    println!("Peak RSS: {peak_rss_kb} KB");
+    println!(
+        "METRICS\tworkload=fib_pair\tprover={prover_name}\tlog_rows={}\trows={rows}\t\
+         num_sequences={}\tmain_cols={main_cols}\taux_cols={aux_cols}\ttables=1\t\
+         logup=false\tblowup={}\tfri_queries={}\tgrinding={}\tprove_s={:.6}\t\
+         verify_s={:.6}\tproof_size_bytes={}\tpeak_rss_kb={peak_rss_kb}\t\
+         rows_per_sec={:.3}\tcells_per_sec={:.3}",
+        args.log_rows,
+        args.num_sequences,
+        args.blowup,
+        args.queries,
+        args.grinding,
+        metrics.prove_s,
+        metrics.verify_s,
+        metrics.proof_size_bytes,
+        rows_per_sec,
+        cells_per_sec,
+    );
     ExitCode::SUCCESS
 }
diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
index 751e86855..9c1ca6024 100644
--- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
+++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
@@ -180,7 +180,8 @@ where
 }
 
 /// Public inputs: initial `(a, b) = (left, right)` pair for each sequence.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
+#[serde(bound = "")]
 pub struct FibonacciPairPublicInputs<F: IsFFTField> {
     pub initial_values: Vec<(FieldElement<F>, FieldElement<F>)>,
 }
@@ -209,6 +210,10 @@ where
         1
     }
 
+    fn name(&self) -> &str {
+        "fib_pair"
+    }
+
     fn new(proof_options: &ProofOptions) -> Self {
         Self::with_num_sequences(proof_options, 2)
     }
diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs
index 971660f37..a266cac04 100644
--- a/bench_vs_plonky3/src/plonky3_config.rs
+++ b/bench_vs_plonky3/src/plonky3_config.rs
@@ -45,22 +45,25 @@ fn build_mmcs() -> (ValMmcs, ChallengeMmcs, ByteHash) {
     (val_mmcs, challenge_mmcs, byte_hash)
 }
 
-/// Creates a Plonky3 STARK config with parameters matched to Lambda's
-/// production config `GoldilocksCubicProofOptions::with_blowup(2)`:
-/// blowup=2, 219 FRI queries, grinding=0 (excluded from benchmark).
-pub fn matched_params_config() -> P3Config {
+/// Creates a Plonky3 STARK config with parameters matched to Lambda's proof
+/// options. `blowup` must be a power of two because Plonky3 stores it as
+/// `log_blowup`.
+pub fn params_config(blowup: u8, queries: usize, grinding: u8) -> P3Config {
+    assert!(
+        blowup.is_power_of_two(),
+        "blowup must be a power of two for Plonky3"
+    );
+
     let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs();
     let dft = Dft::default();
     let challenger = Challenger::from_hasher(vec![], byte_hash);
 
-    // Match Lambda production: blowup=2, queries=219, grinding=0.
-    // Grinding excluded from benchmark (identical PoW on both sides).
     let fri_params = FriParameters {
-        log_blowup: 1, // blowup = 2
+        log_blowup: blowup.trailing_zeros() as usize,
         log_final_poly_len: 0,
         max_log_arity: 1,
-        num_queries: 219,
-        commit_proof_of_work_bits: 0,
+        num_queries: queries,
+        commit_proof_of_work_bits: grinding as usize,
         query_proof_of_work_bits: 0,
         mmcs: challenge_mmcs,
     };
@@ -68,3 +71,10 @@ pub fn matched_params_config() -> P3Config {
     let pcs = Pcs::new(dft, val_mmcs, fri_params);
     P3Config::new(pcs, challenger)
 }
+
+/// Creates a Plonky3 STARK config with parameters matched to Lambda's
+/// production config `GoldilocksCubicProofOptions::with_blowup(2)`:
+/// blowup=2, 219 FRI queries, grinding=0.
+pub fn matched_params_config() -> P3Config {
+    params_config(2, 219, 0)
+}

From 6b57545b88ef461b0f37dd6422a920bcd40b2b91 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 12 May 2026 18:34:45 -0300
Subject: [PATCH 28/34]  Add Lambda-vs-Plonky3 sections to the nightly Slack
 post

---
 .github/scripts/publish_bench_vs.sh       | 121 +++++++++++++++++++++-
 .github/workflows/bench-vs-nightly.yml    |  11 ++
 .github/workflows/bench-vs-p3-nightly.yml |  47 ---------
 bench_vs_plonky3/Cargo.toml               |  38 +++----
 bench_vs_plonky3/run_p3_nightly.sh        |  51 +++++++++
 bench_vs_plonky3/src/plonky3_config.rs    |   8 +-
 6 files changed, 198 insertions(+), 78 deletions(-)
 delete mode 100644 .github/workflows/bench-vs-p3-nightly.yml
 create mode 100755 bench_vs_plonky3/run_p3_nightly.sh

diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh
index 4408c17c0..f30dcce52 100644
--- a/.github/scripts/publish_bench_vs.sh
+++ b/.github/scripts/publish_bench_vs.sh
@@ -79,6 +79,125 @@ if [ -n "$LAMBDA_PROJECTED_H" ] || [ -n "$SP1_PROJECTED_H" ]; then
     PROJ_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Linear Projection"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$PROJ_MRKDWN"'"}}'
 fi
 
+# --- Plonky3 section (optional) --------------------------------------------
+# Built when `bench_vs_artifacts/p3/headline/metrics.txt` exists. The headline
+# row comes from that file; column-scaling rows are read from the per-N
+# subdirs written by the workflow.
+
+p3_parse() {
+    local file=$1
+    local key=$2
+    { grep "^${key}=" "$file" 2>/dev/null || true; } | cut -d= -f2-
+}
+
+p3_fmt_seconds() {
+    LC_NUMERIC=C awk -v s="$1" 'BEGIN {
+        if (s == "") { print "n/a"; exit }
+        if (s + 0 < 1) printf "%.0fms", s * 1000
+        else printf "%.3fs", s
+    }'
+}
+
+p3_fmt_mb() {
+    LC_NUMERIC=C awk -v b="$1" 'BEGIN {
+        if (b == "") { print "n/a"; exit }
+        printf "%.1f MB", b / (1024 * 1024)
+    }'
+}
+
+p3_fmt_gb() {
+    LC_NUMERIC=C awk -v kb="$1" 'BEGIN {
+        if (kb == "") { print "n/a"; exit }
+        printf "%.2f GB", kb / (1024 * 1024)
+    }'
+}
+
+p3_fmt_ratio_pair() {
+    LC_NUMERIC=C awk -v a="$1" -v b="$2" 'BEGIN {
+        if (a == "" || b == "" || b + 0 == 0) { print "n/a"; exit }
+        printf "%.2fx", a / b
+    }'
+}
+
+P3_SECTION=""
+P3_HEADLINE_FILE="bench_vs_artifacts/p3/headline/metrics.txt"
+if [ -f "$P3_HEADLINE_FILE" ]; then
+    H_LOG_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "log_rows_series")
+    H_COLS=$(p3_parse "$P3_HEADLINE_FILE" "columns")
+    H_BLOWUP=$(p3_parse "$P3_HEADLINE_FILE" "blowup")
+    H_QUERIES=$(p3_parse "$P3_HEADLINE_FILE" "fri_queries")
+    H_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "rows_series")
+    H_LAMBDA_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "lambda_prove_medians")
+    H_P3_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "p3_prove_medians")
+    H_LAMBDA_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "lambda_verify_medians")
+    H_P3_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "p3_verify_medians")
+    H_LAMBDA_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "lambda_proof_size_medians")
+    H_P3_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "p3_proof_size_medians")
+    H_LAMBDA_RSS=$(p3_parse "$P3_HEADLINE_FILE" "lambda_peak_rss_medians")
+    H_P3_RSS=$(p3_parse "$P3_HEADLINE_FILE" "p3_peak_rss_medians")
+    H_RATIO=$(p3_parse "$P3_HEADLINE_FILE" "ratios_lambda_over_p3")
+
+    H_ROWS_FMT=$(LC_NUMERIC=C awk -v r="$H_ROWS" 'BEGIN {
+        if (r == "") { print "n/a"; exit }
+        if (r + 0 >= 1000000) printf "%.1fM", r / 1000000
+        else if (r + 0 >= 1000) printf "%.0fK", r / 1000
+        else printf "%d", r
+    }')
+
+    PROOF_RATIO=$(p3_fmt_ratio_pair "$H_LAMBDA_PROOF" "$H_P3_PROOF")
+    RSS_RATIO=$(p3_fmt_ratio_pair "$H_LAMBDA_RSS" "$H_P3_RSS")
+    PROVE_RATIO_FMT=$(LC_NUMERIC=C awk -v r="$H_RATIO" 'BEGIN {
+        if (r == "" || r == "n/a") { print "n/a"; exit }
+        printf "%.2fx", r
+    }')
+
+    P3_HEADLINE_MRKDWN="*log_rows=${H_LOG_ROWS} (${H_ROWS_FMT} rows · ${H_COLS} cols · blowup=${H_BLOWUP} · ${H_QUERIES} queries)*"
+    P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Lambda:* $(p3_fmt_seconds "$H_LAMBDA_PROVE") prove · $(p3_fmt_seconds "$H_LAMBDA_VERIFY") verify · $(p3_fmt_mb "$H_LAMBDA_PROOF") proof · $(p3_fmt_gb "$H_LAMBDA_RSS") RSS"
+    P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS"
+    P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS"
+
+    # Render a `(label|file)` list into a multi-line mrkdwn block with
+    # `*label* Lambda Xs / P3 Ys — Rx` per row. Used by both sweep sections.
+    p3_render_sweep() {
+        local out=""
+        local entry label file lambda_t p3_t ratio ratio_fmt line
+        for entry in "$@"; do
+            label="${entry%%|*}"
+            file="${entry##*|}"
+            if [ ! -f "$file" ]; then
+                line="*${label}* (no data)"
+            else
+                lambda_t=$(p3_parse "$file" "lambda_prove_medians")
+                p3_t=$(p3_parse "$file" "p3_prove_medians")
+                ratio=$(p3_parse "$file" "ratios_lambda_over_p3")
+                ratio_fmt=$(LC_NUMERIC=C awk -v r="$ratio" 'BEGIN {
+                    if (r == "" || r == "n/a") { print "n/a"; exit }
+                    printf "%.2fx", r
+                }')
+                line="*${label}* Lambda $(p3_fmt_seconds "$lambda_t") / P3 $(p3_fmt_seconds "$p3_t") — ${ratio_fmt}"
+            fi
+            if [ -n "$out" ]; then
+                out="${out}\\n${line}"
+            else
+                out="$line"
+            fi
+        done
+        printf '%s' "$out"
+    }
+
+    P3_SIZE_MRKDWN=$(p3_render_sweep \
+        "log_rows=19|bench_vs_artifacts/p3/size_log19/metrics.txt" \
+        "log_rows=20|bench_vs_artifacts/p3/size_log20/metrics.txt" \
+        "log_rows=21|bench_vs_artifacts/p3/headline/metrics.txt")
+
+    P3_COLS_MRKDWN=$(p3_render_sweep \
+        "8 cols (n=4):|bench_vs_artifacts/p3/cols_n4/metrics.txt" \
+        "32 cols (n=16):|bench_vs_artifacts/p3/headline/metrics.txt" \
+        "128 cols (n=64):|bench_vs_artifacts/p3/cols_n64/metrics.txt")
+
+    P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Size scaling @ 32 cols"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_SIZE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Column scaling @ log_rows='"$H_LOG_ROWS"'"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_COLS_MRKDWN"'"}}'
+fi
+
 curl -X POST "$WEBHOOK_URL" \
     -H 'Content-Type: application/json; charset=utf-8' \
-    --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"divider"},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION"']}'
+    --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"divider"},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION$P3_SECTION"']}'
diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml
index 2118632f8..f27cdf356 100644
--- a/.github/workflows/bench-vs-nightly.yml
+++ b/.github/workflows/bench-vs-nightly.yml
@@ -47,6 +47,17 @@ jobs:
             --report-dir bench_vs_artifacts \
             --no-color
 
+      - name: Refresh Plonky3 to latest main
+        run: |
+          cargo update --manifest-path bench_vs_plonky3/Cargo.toml \
+            -p p3-air -p p3-field -p p3-goldilocks -p p3-matrix \
+            -p p3-commit -p p3-challenger -p p3-symmetric \
+            -p p3-merkle-tree -p p3-keccak -p p3-fri \
+            -p p3-uni-stark -p p3-dft
+
+      - name: Run Plonky3 nightly benchmark
+        run: bash ./bench_vs_plonky3/run_p3_nightly.sh bench_vs_artifacts/p3
+
       - name: Upload nightly benchmark artifact
         uses: actions/upload-artifact@v4
         with:
diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml
deleted file mode 100644
index 03fedad2b..000000000
--- a/.github/workflows/bench-vs-p3-nightly.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-name: Bench Vs Plonky3 Nightly
-
-on:
-  schedule:
-    # 04:30 America/Argentina/Buenos_Aires = 07:30 UTC
-    # SP1 nightly fires at 06:00 UTC (03:00 BA) and runs ~1.5h; scheduling 1.5h
-    # later leaves the self-hosted bench runner free.
-    - cron: "30 7 * * *"
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-concurrency:
-  group: bench-vs-p3-nightly-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  bench-vs-p3:
-    runs-on: [self-hosted, bench]
-    timeout-minutes: 60
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Rust Environment
-        uses: ./.github/actions/setup-rust
-
-      - name: Add cargo to PATH
-        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
-
-      - name: Run nightly Plonky3 benchmark
-        run: |
-          bash ./bench_vs_plonky3/run.sh \
-            --log-rows 19 \
-            --num-sequences 16 \
-            --runs 10 \
-            --scalar \
-            --report-dir bench_vs_p3_artifacts \
-            --no-color
-
-      - name: Upload nightly benchmark artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: bench-vs-p3-nightly-${{ github.run_number }}-${{ github.sha }}
-          path: bench_vs_p3_artifacts
-          retention-days: 90
diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index 92deaa31c..8fef10667 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -12,28 +12,18 @@ math = { path = "../crypto/math", features = [
     "lambdaworks-serde-binary",
 ] }
 
-# Plonky3: pinned to the yetanotherco fork, branch `feat/goldilocks_deg3`.
-# The branch adds BinomiallyExtendable<3> for Goldilocks (x^3 - 2), matching
-# Lambda's Degree3GoldilocksExtensionField. All p3-* crates MUST resolve to
-# the same git source + ref; declaring any of them as a crates.io dep would
-# pull in a second incompatible p3-field. cargo clones the fork once into
-# ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time.
-p3-air = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-field = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-goldilocks = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-matrix = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-commit = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-challenger = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-symmetric = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-keccak = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-fri = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" }
-p3-uni-stark = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
-    "parallel",
-] }
-p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [
-    "parallel",
-] }
+p3-air = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-field = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-commit = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-fri = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] }
+p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] }
 
 # Tracing for P3 span-based profiling
 tracing = "0.1"
@@ -49,8 +39,8 @@ criterion = { version = "0.4", default-features = false }
 # Both provers run multi-threaded by default: Plonky3's `Radix2DitParallel` DFT
 # uses rayon unconditionally, so Lambda must also enable `parallel` for a fair
 # apples-to-apples comparison. Disable with `--no-default-features` to compare
-# single-threaded. Cubic extension (`x^3 - 2`) matching Lambda is unconditional
-# — the fork ships `BinomiallyExtendable<3>` for Goldilocks natively.
+# single-threaded. The cubic extension is `x^3 - 2` (binomial) on Lambda and
+# `x^3 - x - 1` (trinomial) on upstream Plonky3 — same degree, same soundness.
 default = ["parallel"]
 parallel = ["stark/parallel"]
 instruments = ["stark/instruments"]
diff --git a/bench_vs_plonky3/run_p3_nightly.sh b/bench_vs_plonky3/run_p3_nightly.sh
new file mode 100755
index 000000000..495b0ed58
--- /dev/null
+++ b/bench_vs_plonky3/run_p3_nightly.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Orchestrates the Lambda-vs-Plonky3 nightly benchmark.
+#
+# Runs 5 configurations of run.sh into separate report-dirs under
+# `$REPORT_BASE`. The same 5 dirs are consumed by
+# `.github/scripts/publish_bench_vs.sh` to render the 3-section Slack post
+# (Headline + Size scaling + Column scaling).
+#
+# Usage:
+#   ./bench_vs_plonky3/run_p3_nightly.sh [REPORT_BASE]
+#
+# Defaults: REPORT_BASE=bench_vs_artifacts/p3
+#
+# Each run is 10 iterations × 2 provers; the 5 runs together take ~3 min on
+# the bench server.
+
+set -euo pipefail
+
+REPORT_BASE="${1:-bench_vs_artifacts/p3}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+RUN_SH="$SCRIPT_DIR/run.sh"
+
+if [ ! -x "$RUN_SH" ]; then
+    echo "run.sh not found or not executable at $RUN_SH" >&2
+    exit 1
+fi
+
+run_one() {
+    local label=$1
+    local log_rows=$2
+    local num_sequences=$3
+    local out_dir="$REPORT_BASE/$label"
+    echo
+    echo "=== ${label} (log_rows=${log_rows}, num_sequences=${num_sequences}) ==="
+    bash "$RUN_SH" \
+        --log-rows "$log_rows" \
+        --num-sequences "$num_sequences" \
+        --runs 10 \
+        --scalar \
+        --report-dir "$out_dir" \
+        --no-color
+}
+
+# Size sweep + headline (32 cols).
+run_one size_log19 19 16
+run_one size_log20 20 16
+run_one headline   21 16
+
+# Column sweep @ log_rows=21.
+run_one cols_n4    21  4
+run_one cols_n64   21 64
diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs
index a266cac04..774009bba 100644
--- a/bench_vs_plonky3/src/plonky3_config.rs
+++ b/bench_vs_plonky3/src/plonky3_config.rs
@@ -1,7 +1,7 @@
 use p3_challenger::{HashChallenger, SerializingChallenger64};
 use p3_commit::ExtensionMmcs;
 use p3_dft::Radix2DitParallel;
-use p3_field::extension::BinomialExtensionField;
+use p3_field::extension::CubicTrinomialExtensionField;
 use p3_fri::{FriParameters, TwoAdicFriPcs};
 use p3_goldilocks::Goldilocks;
 use p3_keccak::{Keccak256Hash, KeccakF};
@@ -10,11 +10,7 @@ use p3_symmetric::{CompressionFunctionFromHasher, PaddingFreeSponge, Serializing
 use p3_uni_stark::StarkConfig;
 
 pub type Val = Goldilocks;
-
-/// Cubic extension matching Lambda's `Degree3GoldilocksExtensionField`
-/// (irreducible x^3 - 2). Provided by the forked `p3-goldilocks` via
-/// `BinomiallyExtendable<3>`.
-pub type Challenge = BinomialExtensionField<Val, 3>;
+pub type Challenge = CubicTrinomialExtensionField<Val>;
 
 type ByteHash = Keccak256Hash;
 type U64Hash = PaddingFreeSponge<KeccakF, 25, 17, 4>;

From 82db83b0e760ba82bfd98bab30d39d93ba1c1af1 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Mon, 18 May 2026 16:21:49 -0300
Subject: [PATCH 29/34] Trim Lambda-vs-Plonky3 nightly to headline-only (drop
 size and column sweeps)

---
 .github/scripts/publish_bench_vs.sh | 45 ++---------------------------
 bench_vs_plonky3/run_p3_nightly.sh  | 20 ++++---------
 2 files changed, 7 insertions(+), 58 deletions(-)

diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh
index f30dcce52..5e6128bfb 100644
--- a/.github/scripts/publish_bench_vs.sh
+++ b/.github/scripts/publish_bench_vs.sh
@@ -80,9 +80,7 @@ if [ -n "$LAMBDA_PROJECTED_H" ] || [ -n "$SP1_PROJECTED_H" ]; then
 fi
 
 # --- Plonky3 section (optional) --------------------------------------------
-# Built when `bench_vs_artifacts/p3/headline/metrics.txt` exists. The headline
-# row comes from that file; column-scaling rows are read from the per-N
-# subdirs written by the workflow.
+# Built when `bench_vs_artifacts/p3/headline/metrics.txt` exists.
 
 p3_parse() {
     local file=$1
@@ -156,46 +154,7 @@ if [ -f "$P3_HEADLINE_FILE" ]; then
     P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS"
     P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS"
 
-    # Render a `(label|file)` list into a multi-line mrkdwn block with
-    # `*label* Lambda Xs / P3 Ys — Rx` per row. Used by both sweep sections.
-    p3_render_sweep() {
-        local out=""
-        local entry label file lambda_t p3_t ratio ratio_fmt line
-        for entry in "$@"; do
-            label="${entry%%|*}"
-            file="${entry##*|}"
-            if [ ! -f "$file" ]; then
-                line="*${label}* (no data)"
-            else
-                lambda_t=$(p3_parse "$file" "lambda_prove_medians")
-                p3_t=$(p3_parse "$file" "p3_prove_medians")
-                ratio=$(p3_parse "$file" "ratios_lambda_over_p3")
-                ratio_fmt=$(LC_NUMERIC=C awk -v r="$ratio" 'BEGIN {
-                    if (r == "" || r == "n/a") { print "n/a"; exit }
-                    printf "%.2fx", r
-                }')
-                line="*${label}* Lambda $(p3_fmt_seconds "$lambda_t") / P3 $(p3_fmt_seconds "$p3_t") — ${ratio_fmt}"
-            fi
-            if [ -n "$out" ]; then
-                out="${out}\\n${line}"
-            else
-                out="$line"
-            fi
-        done
-        printf '%s' "$out"
-    }
-
-    P3_SIZE_MRKDWN=$(p3_render_sweep \
-        "log_rows=19|bench_vs_artifacts/p3/size_log19/metrics.txt" \
-        "log_rows=20|bench_vs_artifacts/p3/size_log20/metrics.txt" \
-        "log_rows=21|bench_vs_artifacts/p3/headline/metrics.txt")
-
-    P3_COLS_MRKDWN=$(p3_render_sweep \
-        "8 cols (n=4):|bench_vs_artifacts/p3/cols_n4/metrics.txt" \
-        "32 cols (n=16):|bench_vs_artifacts/p3/headline/metrics.txt" \
-        "128 cols (n=64):|bench_vs_artifacts/p3/cols_n64/metrics.txt")
-
-    P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Size scaling @ 32 cols"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_SIZE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Column scaling @ log_rows='"$H_LOG_ROWS"'"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_COLS_MRKDWN"'"}}'
+    P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}}'
 fi
 
 curl -X POST "$WEBHOOK_URL" \
diff --git a/bench_vs_plonky3/run_p3_nightly.sh b/bench_vs_plonky3/run_p3_nightly.sh
index 495b0ed58..47957b6ff 100755
--- a/bench_vs_plonky3/run_p3_nightly.sh
+++ b/bench_vs_plonky3/run_p3_nightly.sh
@@ -1,18 +1,15 @@
 #!/usr/bin/env bash
 # Orchestrates the Lambda-vs-Plonky3 nightly benchmark.
 #
-# Runs 5 configurations of run.sh into separate report-dirs under
-# `$REPORT_BASE`. The same 5 dirs are consumed by
-# `.github/scripts/publish_bench_vs.sh` to render the 3-section Slack post
-# (Headline + Size scaling + Column scaling).
+# Runs the headline configuration (log_rows=21, num_sequences=16 → 32 cols)
+# into `$REPORT_BASE/headline/`. Consumed by
+# `.github/scripts/publish_bench_vs.sh` to render the Headline section of
+# the Slack post.
 #
 # Usage:
 #   ./bench_vs_plonky3/run_p3_nightly.sh [REPORT_BASE]
 #
 # Defaults: REPORT_BASE=bench_vs_artifacts/p3
-#
-# Each run is 10 iterations × 2 provers; the 5 runs together take ~3 min on
-# the bench server.
 
 set -euo pipefail
 
@@ -41,11 +38,4 @@ run_one() {
         --no-color
 }
 
-# Size sweep + headline (32 cols).
-run_one size_log19 19 16
-run_one size_log20 20 16
-run_one headline   21 16
-
-# Column sweep @ log_rows=21.
-run_one cols_n4    21  4
-run_one cols_n64   21 64
+run_one headline 21 16

From c84a12cce896d5c062a03b24ceab9e65e98a6cfc Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 19 May 2026 10:06:51 -0300
Subject: [PATCH 30/34] fix bench: pin Plonky3 rev, scalar MMCS, AUDIT lines

---
 bench_vs_plonky3/Cargo.toml                   | 27 +++---
 bench_vs_plonky3/README.md                    | 84 +++++++++++++++----
 bench_vs_plonky3/run.sh                       | 48 +++++++++--
 bench_vs_plonky3/src/bin/prove_bench.rs       | 56 +++++++++++++
 bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 38 +++++++++
 bench_vs_plonky3/src/plonky3_config.rs        | 38 +++++----
 6 files changed, 237 insertions(+), 54 deletions(-)

diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index 8fef10667..26bb49cc1 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -12,18 +12,21 @@ math = { path = "../crypto/math", features = [
     "lambdaworks-serde-binary",
 ] }
 
-p3-air = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-field = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-commit = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-fri = { git = "https://github.com/Plonky3/Plonky3.git" }
-p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] }
-p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] }
+# Pinned to the commit currently resolved in Cargo.lock so the benchmark is
+# reproducible against an exact P3 revision. Bumping is fine; it must be an
+# explicit decision, not the result of an unrelated `cargo update`.
+p3-air = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-field = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-commit = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-fri = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
+p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] }
+p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] }
 
 # Tracing for P3 span-based profiling
 tracing = "0.1"
diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md
index 066582280..a78249f31 100644
--- a/bench_vs_plonky3/README.md
+++ b/bench_vs_plonky3/README.md
@@ -35,7 +35,7 @@ test.
 ## Usage
 
 ```bash
-# Default: log-rows=19, num-sequences=16, runs=3, cubic extension, no scalar
+# Default: log-rows=19, num-sequences=16, runs=10, cubic extension, no scalar
 ./bench_vs_plonky3/run.sh
 
 # Size sweep
@@ -58,10 +58,10 @@ test.
 |---|---|---|
 | `--log-rows K [K ...]` | `19` | One or more power-of-2 row counts. |
 | `--num-sequences N` | `16` | Number of Fibonacci sequences (columns = `2 × N`). |
-| `--runs N` | `3` | Runs per `(size, prover)`; median is reported. |
+| `--runs N` | `10` | Runs per `(size, prover)`; median and coefficient of variation (CV) are reported. |
 | `--lambda-only` / `--p3-only` | both | Restrict to a single prover. |
-| `--report-dir DIR` | — | Write TSV + metrics + raw stdouts. |
-| `--scalar` | off | Pin `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` so Goldilocks (and most of Keccak) run scalar on both sides. x86_64 only; on other archs the flag is ignored with a warning. Residual SSE2 on `p3-keccak` remains (~7% of total prove time). |
+| `--report-dir DIR` | — | Write TSV + metrics + raw stdouts + raw audits. |
+| `--scalar` | off | Pin `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` so Goldilocks field arithmetic runs scalar on both sides. x86_64 only; on other archs the flag is ignored with a warning. The MMCS is already scalar regardless of this flag (see [P3 config: scalar MMCS](#p3-config-scalar-mmcs)). |
 | `--no-color` | off | Disable ANSI colors. |
 | `-h` / `--help` | — | Print usage. |
 
@@ -73,8 +73,10 @@ Stdout (without `--report-dir`):
 === STARK prove benchmark: Lambda vs Plonky3 ===
   log-rows:       19
   num-sequences:  16  (columns = 32)
-  runs/size:      3  (median reported)
-  p3 extension:   degree 3 (forked p3-goldilocks, matches Lambda)
+  runs/size:      10  (median + CV reported)
+  p3 extension:   upstream CubicTrinomialExtensionField (x^3 - x - 1)
+  p3 mmcs:        scalar Keccak256 (val_packing_width=1, hash_lanes=1)
+  proof params:   blowup=2, queries=219, grinding=0
   scalar mode:    on  (arch=x86_64, RUSTFLAGS="-C target-feature=-avx2,-avx512f")
 
 [build] prove_bench
@@ -95,9 +97,16 @@ With `--report-dir DIR` the script writes:
 
 - `results.tsv` — tab-separated raw data (`log_rows, rows, lambda_median_s,
   p3_median_s, ratio_lambda_over_p3, runs`).
+- `raw_metrics.tsv` — one row per `(prover, log_rows, run)` with all
+  `METRICS` fields parsed out.
+- `raw_audits.tsv` — one row per `(prover, log_rows, run)` with the AUDIT
+  line emitted by `prove_bench` before each prove call. Lets you confirm in
+  retrospect that `val_packing_width=1`, `hash_lanes=1`,
+  `base_transition_constraints=2×num_sequences`, etc. Don't trust a number
+  without skimming this file.
 - `metrics.txt` — key=value pairs with the config used (arch, scalar flag,
-  extension degree, blowup, queries, runs, rustflags) and the per-series
-  values slash-joined (so post-processing scripts can split easily).
+  extension, mmcs choice, blowup, queries, runs, rustflags) and the
+  per-series values slash-joined (so post-processing scripts can split easily).
 - `raw/` — per-invocation stdouts (`{prover}_log{K}_run{i}.stdout`).
 
 No markdown file is generated — the TSV is the single source of truth for
@@ -156,18 +165,59 @@ cargo test -p bench-vs-plonky3 --features instruments --release -- \
 The nightly does **not** activate this path — it would add ~1 % overhead and
 pollute the historical wall-clock numbers.
 
+## P3 config: scalar MMCS
+
+`plonky3_config.rs` sets up the P3 stark config with a deliberately
+**non-production** MMCS:
+
+```rust
+type ByteHash = Keccak256Hash;                               // tiny_keccak scalar
+type FieldHash = SerializingHasher<ByteHash>;
+type MyCompress = CompressionFunctionFromHasher<ByteHash, 2, 32>;
+pub type ValMmcs = MerkleTreeMmcs<Val, u8, FieldHash, MyCompress, 2, 32>;
+```
+
+The Plonky3 default for Goldilocks MMCS uses `PaddingFreeSponge<KeccakF, 25,
+17, 4>` with leaves `[Val; VECTOR_LEN]` and digests `[u64; VECTOR_LEN]`,
+where `VECTOR_LEN` is set at compile-time per arch: NEON=2, AVX-512=8,
+AVX2=4, SSE2=2, fallback=1. That gives Plonky3 a free `N×` Keccak speedup
+on every Merkle node — which Lambda's `sha3::Keccak256` cannot exploit
+because the Lambda MMCS hashes a single input at a time.
+
+The scalar config here makes both sides hash one input per Keccak call.
+Both still use the **same Keccak-f[1600] permutation** (capacity 512, rate
+1088, 256-bit output, Keccak-original 0x01 padding); the only thing
+removed is data-parallel lanes on the P3 side. Consequence: the ratio
+published by this bench is **apples-to-apples scalar**, not "Plonky3 as
+shipped in production." If you want the production-realistic P3 number,
+swap the MMCS back to the vector-lane variant from upstream's examples.
+
+On aarch64 with `feature="asm"` enabled in `crypto/crypto`, Lambda's
+`sha3::Keccak256` uses ARMv8 SHA3 intrinsics, which speeds up *one* Keccak
+call (no data parallelism). `tiny_keccak`'s `Keccak256Hash` on P3 is pure
+Rust and gets no such acceleration. On an x86_64 server, neither side has
+that path, so the comparison is cleanest there.
+
 ## Notes on fairness
 
-- **Extension field**: Plonky3 runs `BinomialExtensionField<Goldilocks, 3>`
-  with the same `x^3 - 2` irreducible as Lambda's
-  `Degree3GoldilocksExtensionField`. Both sides use the same cubic extension.
+- **Extension field**: Plonky3 runs upstream `CubicTrinomialExtensionField`
+  over Goldilocks (`x^3 - x - 1`); Lambda runs `Degree3GoldilocksExtensionField`
+  (`x^3 - 2`). Both are degree-3 extensions of `GF(p)` by an irreducible cubic,
+  with the same field size and soundness. Cell-by-cell trace equivalence is
+  asserted by `lambda_pair_trace_matches_plonky3_trace`.
 - **Parallelism**: both provers are multi-threaded by default. Lambda pulls
-  rayon via `stark/parallel`; Plonky3 pulls rayon via
-  `p3-uni-stark` / `p3-dft` (hardcoded `features = ["parallel"]`, always on).
-- **SIMD**: without `--scalar`, each side uses whatever target-features the
-  compiler decides from the host CPU. `--scalar` (x86_64 only) disables AVX2
-  and AVX-512 so Goldilocks arithmetic is scalar on both sides. `p3-keccak`'s
-  SSE2 path on x86 is not disabled.
+  rayon via `stark/parallel`; Plonky3 pulls rayon via `p3-uni-stark` /
+  `p3-dft` (hardcoded `features = ["parallel"]`, always on).
+- **SIMD**: the MMCS Keccak is scalar on both sides (see above). For
+  Goldilocks field arithmetic, without `--scalar` each side uses whatever
+  target-features the compiler decides from the host CPU. `--scalar`
+  (x86_64 only) disables AVX2 / AVX-512.
+- **AIR base-field path**: the Lambda AIR overrides
+  `num_base_transition_constraints` and implements `evaluate_prover` so its
+  Fibonacci transition constraints are evaluated in the base field (F×E,
+  ≈3 muls/term) instead of the default extension path (E×E, ≈9 muls/term).
+  This matches what the production Lambda STARK does for all
+  domain-constraint AIRs.
 - **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both
   sides. Security models differ (Lambda: Johnson-bound, ~108 bits proven;
   P3: conjectured, 219 queries × 1 bit = 219 bits, capped at 192 by the
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index 0098fed33..20f1f6331 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -10,9 +10,10 @@
 # Defaults: --log-rows 19, --num-sequences 16, --runs 10.
 # With multiple --log-rows values, prints one stats row per size.
 #
-# --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks (and most of Keccak)
-# run scalar; residual SSE2 in p3-keccak remains. Triggers a rebuild when
-# toggling; subsequent runs with the same RUSTFLAGS are cached.
+# --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks runs scalar. The MMCS
+# itself is already scalar (single-input tiny_keccak via Keccak256Hash) regardless
+# of this flag — its SIMD lanes were removed in the config. Triggers a rebuild
+# when toggling; subsequent runs with the same RUSTFLAGS are cached.
 
 set -euo pipefail
 
@@ -123,10 +124,9 @@ if [ -n "$REPORT_DIR" ]; then
 fi
 
 # --- Scalar (no SIMD) toggle ------------------------------------------------
-# When --scalar is on, disable AVX2/AVX-512 so Goldilocks (and most of Keccak)
-# run scalar for an apples-to-apples comparison against Lambda STARK. The
-# residual SSE2 path on p3-keccak is intentionally left enabled — its
-# contribution to total prove time is ~7%.
+# When --scalar is on, disable AVX2/AVX-512 so Goldilocks field arithmetic runs
+# scalar for an apples-to-apples comparison against Lambda STARK. The MMCS Keccak
+# is already scalar regardless of this flag (see plonky3_config.rs).
 # Cargo caches per-RUSTFLAGS, so toggling scalar vs vector triggers a rebuild
 # on first use but is cached afterwards.
 SCALAR_RUSTFLAGS=""
@@ -155,7 +155,8 @@ echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}"
 echo -e "  log-rows:       ${YELLOW}${LOG_ROWS[*]}${NC}"
 echo -e "  num-sequences:  ${YELLOW}${NUM_SEQUENCES}${NC}  (columns = $((2 * NUM_SEQUENCES)))"
 echo -e "  runs/size:      ${YELLOW}${RUNS}${NC}  (median + CV reported)"
-echo -e "  p3 extension:   ${YELLOW}degree 3 (forked p3-goldilocks, matches Lambda)${NC}"
+echo -e "  p3 extension:   ${YELLOW}upstream CubicTrinomialExtensionField (x^3 - x - 1)${NC}"
+echo -e "  p3 mmcs:        ${YELLOW}scalar Keccak256 (val_packing_width=1, hash_lanes=1)${NC}"
 echo -e "  proof params:   ${YELLOW}blowup=${BLOWUP}, queries=${FRI_QUERIES}, grinding=${GRINDING}${NC}"
 if $BREAKDOWN; then
     echo -e "  breakdown:      ${YELLOW}on${NC}  (Lambda instruments + P3 tracing spans)"
@@ -206,6 +207,13 @@ extract_metrics_line() {
     }'
 }
 
+extract_audit_line() {
+    sed -n '/^AUDIT	/ {
+        p
+        q
+    }'
+}
+
 metric_value() {
     local line=$1
     local key=$2
@@ -334,8 +342,10 @@ run_prover() {
     local log_rows=$2
     local times=()
     local metrics_file="$TMP_DIR/${prover}_${log_rows}.metrics"
+    local audit_file="$TMP_DIR/${prover}_${log_rows}.audits"
     local breakdown_file="$TMP_DIR/${prover}_${log_rows}.breakdown"
     : > "$metrics_file"
+    : > "$audit_file"
     : > "$breakdown_file"
     for run_i in $(seq 1 "$RUNS"); do
         local out_file="$TMP_DIR/${prover}_${log_rows}_${run_i}.stdout"
@@ -348,6 +358,11 @@ run_prover() {
             cat "$out_file"
             exit 1
         fi
+        local audit_line
+        audit_line=$(extract_audit_line < "$out_file")
+        if [ -n "$audit_line" ]; then
+            printf 'run=%s\t%s\n' "$run_i" "$audit_line" >> "$audit_file"
+        fi
         local metrics_line
         metrics_line=$(extract_metrics_line < "$out_file")
         if [ -z "$metrics_line" ]; then
@@ -537,6 +552,20 @@ if [ -n "$REPORT_DIR" ]; then
         done
     } > "$REPORT_DIR/raw_metrics.tsv"
 
+    # Raw AUDIT lines per run, one row per prover×log_rows×run. Lets the reader
+    # confirm in retrospect that val_packing_width=1, hash_lanes=1, etc.
+    {
+        printf "run\taudit_line\n"
+        for lr in "${RESULT_LOG_ROWS[@]}"; do
+            for prover in lambda p3; do
+                audit_file="$TMP_DIR/${prover}_${lr}.audits"
+                if [ -f "$audit_file" ]; then
+                    cat "$audit_file"
+                fi
+            done
+        done
+    } > "$REPORT_DIR/raw_audits.tsv"
+
     if $BREAKDOWN; then
         {
             printf "run\tworkload\tprover\tlog_rows\trows\tphase\tms\ttable\ttable_rows\tspan\n"
@@ -588,7 +617,8 @@ if [ -n "$REPORT_DIR" ]; then
         else
             echo "breakdown=off"
         fi
-        echo "p3_extension=degree3_fork"
+        echo "p3_extension=upstream_cubic_trinomial"
+        echo "p3_mmcs=scalar_keccak256"
         if $SCALAR_ACTIVE; then
             echo "scalar=on"
             echo "rustflags=$SCALAR_RUSTFLAGS"
diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs
index 66d9baacd..b71b2ce4e 100644
--- a/bench_vs_plonky3/src/bin/prove_bench.rs
+++ b/bench_vs_plonky3/src/bin/prove_bench.rs
@@ -520,6 +520,60 @@ fn run_p3(args: &Args) -> BenchMetrics {
     }
 }
 
+fn print_audit(args: &Args) {
+    let prover_name = match args.prover {
+        ProverKind::Lambda => "lambda",
+        ProverKind::P3 => "p3",
+    };
+    let rows = 1usize << args.log_rows;
+    let main_cols = 2 * args.num_sequences;
+    let trace_cells = rows * main_cols;
+    let public_values = 2 * args.num_sequences;
+    let transition_constraints = 2 * args.num_sequences;
+
+    // Common prefix.
+    let common = format!(
+        "AUDIT\tprover={prover_name}\tworkload=fib_pair\tlog_rows={}\trows={rows}\t\
+         main_cols={main_cols}\taux_cols=0\ttrace_cells={trace_cells}\t\
+         public_values={public_values}",
+        args.log_rows,
+    );
+
+    // Per-prover audit fields.
+    let prover_specific = match args.prover {
+        ProverKind::Lambda => format!(
+            "transition_constraints={transition_constraints}\t\
+             base_transition_constraints={transition_constraints}\t\
+             boundary_constraints={transition_constraints}\t\
+             composition_chunks=1"
+        ),
+        ProverKind::P3 => {
+            // P3 counts 2*num_sequences first-row constraints (boundary equivalent,
+            // encoded inside the AIR via `when_first_row`) + 2*num_sequences
+            // transition constraints, total 4*num_sequences.
+            let air_constraints = 4 * args.num_sequences;
+            let first_row_constraints = 2 * args.num_sequences;
+            format!(
+                "air_constraints={air_constraints}\t\
+                 first_row_constraints={first_row_constraints}\t\
+                 transition_constraints={transition_constraints}\t\
+                 boundary_constraints=0\tquotient_chunks=1\t\
+                 val_packing_width={}\thash_lanes={}",
+                plonky3_config::VAL_PACKING_WIDTH,
+                plonky3_config::HASH_LANES,
+            )
+        }
+    };
+
+    let tail = format!(
+        "blowup={}\tqueries={}\tgrinding={}\t\
+         trace_generation_timed=false\tverify_in_ratio=false",
+        args.blowup, args.queries, args.grinding,
+    );
+
+    println!("{common}\t{prover_specific}\t{tail}");
+}
+
 fn main() -> ExitCode {
     let args = match parse_args() {
         Ok(a) => a,
@@ -530,6 +584,8 @@ fn main() -> ExitCode {
         }
     };
 
+    print_audit(&args);
+
     let metrics = match args.prover {
         ProverKind::Lambda => run_lambda(&args),
         ProverKind::P3 => run_p3(&args),
diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
index 9c1ca6024..60596bddc 100644
--- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
+++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
@@ -104,6 +104,23 @@ where
             }
         }
     }
+
+    fn evaluate_prover(
+        &self,
+        eval_ctx: &TransitionEvaluationContext<F, E>,
+        base_evals: &mut [FieldElement<F>],
+        _ext_evals: &mut [FieldElement<E>],
+    ) {
+        let TransitionEvaluationContext::Prover { frame, .. } = eval_ctx else {
+            unreachable!("evaluate_prover called with non-Prover context");
+        };
+        let s0 = frame.get_evaluation_step(0);
+        let s1 = frame.get_evaluation_step(1);
+        let local_left = s0.get_main_evaluation_element(0, 2 * self.seq_idx);
+        let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+        let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx);
+        base_evals[self.constraint_idx] = next_left - local_left - local_right;
+    }
 }
 
 /// `next.right = local.right + next.left`
@@ -177,6 +194,23 @@ where
             }
         }
     }
+
+    fn evaluate_prover(
+        &self,
+        eval_ctx: &TransitionEvaluationContext<F, E>,
+        base_evals: &mut [FieldElement<F>],
+        _ext_evals: &mut [FieldElement<E>],
+    ) {
+        let TransitionEvaluationContext::Prover { frame, .. } = eval_ctx else {
+            unreachable!("evaluate_prover called with non-Prover context");
+        };
+        let s0 = frame.get_evaluation_step(0);
+        let s1 = frame.get_evaluation_step(1);
+        let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+        let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx);
+        let next_right = s1.get_main_evaluation_element(0, 2 * self.seq_idx + 1);
+        base_evals[self.constraint_idx] = next_right - local_right - next_left;
+    }
 }
 
 /// Public inputs: initial `(a, b) = (left, right)` pair for each sequence.
@@ -226,6 +260,10 @@ where
         &self.constraints
     }
 
+    fn num_base_transition_constraints(&self) -> usize {
+        2 * self.num_sequences
+    }
+
     fn boundary_constraints(
         &self,
         pub_inputs: &Self::PublicInputs,
diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs
index 774009bba..d0ead2657 100644
--- a/bench_vs_plonky3/src/plonky3_config.rs
+++ b/bench_vs_plonky3/src/plonky3_config.rs
@@ -4,26 +4,24 @@ use p3_dft::Radix2DitParallel;
 use p3_field::extension::CubicTrinomialExtensionField;
 use p3_fri::{FriParameters, TwoAdicFriPcs};
 use p3_goldilocks::Goldilocks;
-use p3_keccak::{Keccak256Hash, KeccakF};
+use p3_keccak::Keccak256Hash;
 use p3_merkle_tree::MerkleTreeMmcs;
-use p3_symmetric::{CompressionFunctionFromHasher, PaddingFreeSponge, SerializingHasher};
+use p3_symmetric::{CompressionFunctionFromHasher, SerializingHasher};
 use p3_uni_stark::StarkConfig;
 
 pub type Val = Goldilocks;
 pub type Challenge = CubicTrinomialExtensionField<Val>;
 
+// Scalar byte-oriented MMCS, deliberately not the Plonky3 production config.
+// Leaves are individual field elements, digests are 32 raw bytes, and the
+// underlying Keccak path is single-input tiny_keccak. This removes the
+// `[Val; VECTOR_LEN]` / `[u64; VECTOR_LEN]` Keccak lanes that the
+// vector-friendly upstream config uses (NEON=2, SSE2=2, AVX2=4, AVX-512=8),
+// so the Merkle compression cost is one Keccak-f per call on both sides.
 type ByteHash = Keccak256Hash;
-type U64Hash = PaddingFreeSponge<KeccakF, 25, 17, 4>;
-type FieldHash = SerializingHasher<U64Hash>;
-type MyCompress = CompressionFunctionFromHasher<U64Hash, 2, 4>;
-pub type ValMmcs = MerkleTreeMmcs<
-    [Val; p3_keccak::VECTOR_LEN],
-    [u64; p3_keccak::VECTOR_LEN],
-    FieldHash,
-    MyCompress,
-    2,
-    4,
->;
+type FieldHash = SerializingHasher<ByteHash>;
+type MyCompress = CompressionFunctionFromHasher<ByteHash, 2, 32>;
+pub type ValMmcs = MerkleTreeMmcs<Val, u8, FieldHash, MyCompress, 2, 32>;
 type ChallengeMmcs = ExtensionMmcs<Val, Challenge, ValMmcs>;
 type Dft = Radix2DitParallel<Val>;
 pub type Pcs = TwoAdicFriPcs<Val, Dft, ValMmcs, ChallengeMmcs>;
@@ -31,11 +29,19 @@ pub type Challenger = SerializingChallenger64<Val, HashChallenger<u8, ByteHash,
 
 pub type P3Config = StarkConfig<Pcs, Challenge, Challenger>;
 
+/// Packing width of the MMCS leaves (`P` parameter of `MerkleTreeMmcs`).
+/// `Val` directly = 1; `[Val; N]` would be `N`. Exposed for the AUDIT line.
+pub const VAL_PACKING_WIDTH: usize = 1;
+
+/// Lanes of the underlying Keccak permutation as seen by the MMCS.
+/// `Keccak256Hash` is single-input scalar; lane-vectorized `KeccakF` paths
+/// would set this to 2/4/8 depending on arch.
+pub const HASH_LANES: usize = 1;
+
 fn build_mmcs() -> (ValMmcs, ChallengeMmcs, ByteHash) {
     let byte_hash = ByteHash {};
-    let u64_hash = U64Hash::new(KeccakF {});
-    let field_hash = FieldHash::new(u64_hash);
-    let compress = MyCompress::new(u64_hash);
+    let field_hash = FieldHash::new(byte_hash);
+    let compress = MyCompress::new(byte_hash);
     let val_mmcs = ValMmcs::new(field_hash, compress, 3);
     let challenge_mmcs = ChallengeMmcs::new(val_mmcs.clone());
     (val_mmcs, challenge_mmcs, byte_hash)

From e9d5abb889958c9cc40e02d8d5f0729cf36f9b5f Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 19 May 2026 12:15:53 -0300
Subject: [PATCH 31/34] Refresh Plonky3 to latest main

---
 bench_vs_plonky3/Cargo.toml | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml
index 26bb49cc1..8fef10667 100644
--- a/bench_vs_plonky3/Cargo.toml
+++ b/bench_vs_plonky3/Cargo.toml
@@ -12,21 +12,18 @@ math = { path = "../crypto/math", features = [
     "lambdaworks-serde-binary",
 ] }
 
-# Pinned to the commit currently resolved in Cargo.lock so the benchmark is
-# reproducible against an exact P3 revision. Bumping is fine; it must be an
-# explicit decision, not the result of an unrelated `cargo update`.
-p3-air = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-field = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-commit = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-fri = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" }
-p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] }
-p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] }
+p3-air = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-field = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-commit = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-fri = { git = "https://github.com/Plonky3/Plonky3.git" }
+p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] }
+p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] }
 
 # Tracing for P3 span-based profiling
 tracing = "0.1"

From 18f1f0c7cc173b7002ec9fc93211c2d1a6921306 Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Tue, 19 May 2026 15:49:59 -0300
Subject: [PATCH 32/34] cleanup

---
 .github/scripts/publish_bench_vs.sh    | 42 +++++++++++++-------------
 .github/workflows/bench-vs-nightly.yml |  5 ++-
 bench_vs_plonky3/run_p3_nightly.sh     | 41 -------------------------
 3 files changed, 25 insertions(+), 63 deletions(-)
 delete mode 100755 bench_vs_plonky3/run_p3_nightly.sh

diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh
index 01e80ee0c..f96181e8c 100644
--- a/.github/scripts/publish_bench_vs.sh
+++ b/.github/scripts/publish_bench_vs.sh
@@ -118,22 +118,22 @@ p3_fmt_ratio_pair() {
 }
 
 P3_SECTION=""
-P3_HEADLINE_FILE="bench_vs_artifacts/p3/headline/metrics.txt"
-if [ -f "$P3_HEADLINE_FILE" ]; then
-    H_LOG_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "log_rows_series")
-    H_COLS=$(p3_parse "$P3_HEADLINE_FILE" "columns")
-    H_BLOWUP=$(p3_parse "$P3_HEADLINE_FILE" "blowup")
-    H_QUERIES=$(p3_parse "$P3_HEADLINE_FILE" "fri_queries")
-    H_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "rows_series")
-    H_LAMBDA_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "lambda_prove_medians")
-    H_P3_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "p3_prove_medians")
-    H_LAMBDA_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "lambda_verify_medians")
-    H_P3_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "p3_verify_medians")
-    H_LAMBDA_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "lambda_proof_size_medians")
-    H_P3_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "p3_proof_size_medians")
-    H_LAMBDA_RSS=$(p3_parse "$P3_HEADLINE_FILE" "lambda_peak_rss_medians")
-    H_P3_RSS=$(p3_parse "$P3_HEADLINE_FILE" "p3_peak_rss_medians")
-    H_RATIO=$(p3_parse "$P3_HEADLINE_FILE" "ratios_lambda_over_p3")
+P3_FILE="bench_vs_artifacts/p3/metrics.txt"
+if [ -f "$P3_FILE" ]; then
+    H_LOG_ROWS=$(p3_parse "$P3_FILE" "log_rows_series")
+    H_COLS=$(p3_parse "$P3_FILE" "columns")
+    H_BLOWUP=$(p3_parse "$P3_FILE" "blowup")
+    H_QUERIES=$(p3_parse "$P3_FILE" "fri_queries")
+    H_ROWS=$(p3_parse "$P3_FILE" "rows_series")
+    H_LAMBDA_PROVE=$(p3_parse "$P3_FILE" "lambda_prove_medians")
+    H_P3_PROVE=$(p3_parse "$P3_FILE" "p3_prove_medians")
+    H_LAMBDA_VERIFY=$(p3_parse "$P3_FILE" "lambda_verify_medians")
+    H_P3_VERIFY=$(p3_parse "$P3_FILE" "p3_verify_medians")
+    H_LAMBDA_PROOF=$(p3_parse "$P3_FILE" "lambda_proof_size_medians")
+    H_P3_PROOF=$(p3_parse "$P3_FILE" "p3_proof_size_medians")
+    H_LAMBDA_RSS=$(p3_parse "$P3_FILE" "lambda_peak_rss_medians")
+    H_P3_RSS=$(p3_parse "$P3_FILE" "p3_peak_rss_medians")
+    H_RATIO=$(p3_parse "$P3_FILE" "ratios_lambda_over_p3")
 
     H_ROWS_FMT=$(LC_NUMERIC=C awk -v r="$H_ROWS" 'BEGIN {
         if (r == "") { print "n/a"; exit }
@@ -149,12 +149,12 @@ if [ -f "$P3_HEADLINE_FILE" ]; then
         printf "%.2fx", r
     }')
 
-    P3_HEADLINE_MRKDWN="*log_rows=${H_LOG_ROWS} (${H_ROWS_FMT} rows · ${H_COLS} cols · blowup=${H_BLOWUP} · ${H_QUERIES} queries)*"
-    P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Lambda:* $(p3_fmt_seconds "$H_LAMBDA_PROVE") prove · $(p3_fmt_seconds "$H_LAMBDA_VERIFY") verify · $(p3_fmt_mb "$H_LAMBDA_PROOF") proof · $(p3_fmt_gb "$H_LAMBDA_RSS") RSS"
-    P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS"
-    P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS"
+    P3_MRKDWN="*log_rows=${H_LOG_ROWS} (${H_ROWS_FMT} rows · ${H_COLS} cols · blowup=${H_BLOWUP} · ${H_QUERIES} queries)*"
+    P3_MRKDWN="${P3_MRKDWN}\\n*Lambda:* $(p3_fmt_seconds "$H_LAMBDA_PROVE") prove · $(p3_fmt_seconds "$H_LAMBDA_VERIFY") verify · $(p3_fmt_mb "$H_LAMBDA_PROOF") proof · $(p3_fmt_gb "$H_LAMBDA_RSS") RSS"
+    P3_MRKDWN="${P3_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS"
+    P3_MRKDWN="${P3_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS"
 
-    P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}}'
+    P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_MRKDWN"'"}}'
 fi
 
 ETHREX_METRICS_FILE="bench_vs_artifacts/ethrex_metrics.txt"
diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml
index 152ce95a1..4d21a0a31 100644
--- a/.github/workflows/bench-vs-nightly.yml
+++ b/.github/workflows/bench-vs-nightly.yml
@@ -71,7 +71,10 @@ jobs:
             -p p3-uni-stark -p p3-dft
 
       - name: Run Plonky3 nightly benchmark
-        run: bash ./bench_vs_plonky3/run_p3_nightly.sh bench_vs_artifacts/p3
+        run: |
+          bash ./bench_vs_plonky3/run.sh \
+            --log-rows 21 --num-sequences 16 --runs 10 --scalar \
+            --report-dir bench_vs_artifacts/p3 --no-color
 
       - name: Upload nightly benchmark artifact
         uses: actions/upload-artifact@v4
diff --git a/bench_vs_plonky3/run_p3_nightly.sh b/bench_vs_plonky3/run_p3_nightly.sh
deleted file mode 100755
index 47957b6ff..000000000
--- a/bench_vs_plonky3/run_p3_nightly.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env bash
-# Orchestrates the Lambda-vs-Plonky3 nightly benchmark.
-#
-# Runs the headline configuration (log_rows=21, num_sequences=16 → 32 cols)
-# into `$REPORT_BASE/headline/`. Consumed by
-# `.github/scripts/publish_bench_vs.sh` to render the Headline section of
-# the Slack post.
-#
-# Usage:
-#   ./bench_vs_plonky3/run_p3_nightly.sh [REPORT_BASE]
-#
-# Defaults: REPORT_BASE=bench_vs_artifacts/p3
-
-set -euo pipefail
-
-REPORT_BASE="${1:-bench_vs_artifacts/p3}"
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-RUN_SH="$SCRIPT_DIR/run.sh"
-
-if [ ! -x "$RUN_SH" ]; then
-    echo "run.sh not found or not executable at $RUN_SH" >&2
-    exit 1
-fi
-
-run_one() {
-    local label=$1
-    local log_rows=$2
-    local num_sequences=$3
-    local out_dir="$REPORT_BASE/$label"
-    echo
-    echo "=== ${label} (log_rows=${log_rows}, num_sequences=${num_sequences}) ==="
-    bash "$RUN_SH" \
-        --log-rows "$log_rows" \
-        --num-sequences "$num_sequences" \
-        --runs 10 \
-        --scalar \
-        --report-dir "$out_dir" \
-        --no-color
-}
-
-run_one headline 21 16

From bb7d8baad3e7d71088c9071850c34d0f0fca5d2e Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Wed, 20 May 2026 11:04:55 -0300
Subject: [PATCH 33/34] address comments

---
 .github/scripts/publish_bench_vs.sh           |  4 +-
 bench_vs_plonky3/README.md                    | 42 +++++----
 bench_vs_plonky3/run.sh                       | 32 +------
 bench_vs_plonky3/src/bin/prove_bench.rs       | 78 ++---------------
 bench_vs_plonky3/src/lambda_fibonacci_pair.rs |  7 ++
 bench_vs_plonky3/src/lib.rs                   | 87 +------------------
 6 files changed, 48 insertions(+), 202 deletions(-)

diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh
index f96181e8c..d1585cd9f 100644
--- a/.github/scripts/publish_bench_vs.sh
+++ b/.github/scripts/publish_bench_vs.sh
@@ -13,7 +13,7 @@ METRICS_FILE="bench_vs_artifacts/metrics.txt"
 if [ ! -f "$METRICS_FILE" ]; then
     curl -X POST "$WEBHOOK_URL" \
         -H 'Content-Type: application/json; charset=utf-8' \
-        --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"section","text":{"type":"mrkdwn","text":":x: Benchmark failed - no metrics found. Check the workflow logs."}}]}'
+        --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM Nightly Benchmark"}},{"type":"section","text":{"type":"mrkdwn","text":":x: Benchmark failed - no metrics found. Check the workflow logs."}}]}'
     exit 0
 fi
 
@@ -173,4 +173,4 @@ fi
 
 curl -X POST "$WEBHOOK_URL" \
     -H 'Content-Type: application/json; charset=utf-8' \
-    --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"context","elements":[{"type":"mrkdwn","text":"*Program:* Fibonacci  ·  *Device:* CPU"}]},{"type":"divider"},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION$ETHREX_SECTION$P3_SECTION"']}'
+    --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM Nightly Benchmark"}},{"type":"context","elements":[{"type":"mrkdwn","text":"*Program:* Fibonacci  ·  *Device:* CPU"}]},{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION$ETHREX_SECTION$P3_SECTION"']}'
diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md
index a78249f31..60d44792a 100644
--- a/bench_vs_plonky3/README.md
+++ b/bench_vs_plonky3/README.md
@@ -81,22 +81,25 @@ Stdout (without `--report-dir`):
 
 [build] prove_bench
 --- log-rows=19  (rows = 524288) ---
-  [lambda] median 2.444s from 3 runs: 2.444,2.279,2.830
-  [p3]     median 0.988s from 3 runs: 0.981,0.993,0.988
+  [lambda] prove median 0.574s (CV 3.07%), verify 0.024s, proof 4116000 B, rss 805000 KB
+  [p3]     prove median 0.324s (CV 2.85%), verify 0.019s, proof 1987000 B, rss 627000 KB
 
 === Summary ===
-  log-rows   rows              Lambda (s)          P3 (s)        L/P3
-  --------   ----              ----------          ------        ----
-  19         524288                2.444s          0.988s      2.474x  (P3 faster)
+  log-rows   rows              Lambda (s)      L CV%          P3 (s)     P3 CV%        L/P3
+  --------   ----              ----------      -----          ------     ------        ----
+  19         524288              0.574s         3.07%        0.324s       2.85%      1.770x  (P3 faster)
 
-Timing window: single-shot end-to-end prove.
-Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster.
+Timing window: prove only for the ratio. Verify, proof size, RSS and throughput are reported separately.
 ```
 
 With `--report-dir DIR` the script writes:
 
-- `results.tsv` — tab-separated raw data (`log_rows, rows, lambda_median_s,
-  p3_median_s, ratio_lambda_over_p3, runs`).
+- `results.tsv` — tab-separated, one row per `log_rows` size with 14 columns:
+  `log_rows, rows, lambda_prove_median_s, lambda_prove_cv_pct,
+  lambda_verify_median_s, lambda_proof_size_bytes_median,
+  lambda_peak_rss_kb_median, p3_prove_median_s, p3_prove_cv_pct,
+  p3_verify_median_s, p3_proof_size_bytes_median, p3_peak_rss_kb_median,
+  ratio_lambda_over_p3, runs`.
 - `raw_metrics.tsv` — one row per `(prover, log_rows, run)` with all
   `METRICS` fields parsed out.
 - `raw_audits.tsv` — one row per `(prover, log_rows, run)` with the AUDIT
@@ -114,22 +117,27 @@ downstream tooling.
 
 ## Nightly
 
-A GitHub Actions workflow (`.github/workflows/bench-vs-p3-nightly.yml`) runs
-daily at 07:30 UTC (04:30 Buenos Aires, after the SP1 nightly completes) on
-the self-hosted `bench` runner. It executes:
+The Lambda-vs-Plonky3 bench is part of the shared
+`.github/workflows/bench-vs-nightly.yml` workflow, which runs daily at
+06:00 UTC (03:00 Buenos Aires) on the self-hosted `bench` runner. The P3
+step executes after the Lambda-vs-SP1 and ethrex empty-block steps:
 
 ```bash
 bash ./bench_vs_plonky3/run.sh \
-  --log-rows 19 \
+  --log-rows 21 \
   --num-sequences 16 \
-  --runs 3 \
+  --runs 10 \
   --scalar \
-  --report-dir bench_vs_p3_artifacts \
+  --report-dir bench_vs_artifacts/p3 \
   --no-color
 ```
 
-The `bench_vs_p3_artifacts/` directory is uploaded as an artifact named
-`bench-vs-p3-nightly-<run_number>-<sha>` with 90-day retention.
+Before this step, `cargo update` runs on the `p3-*` crates (one `-p` flag
+each) so the bench tracks the latest upstream Plonky3 `main`. The full
+`bench_vs_artifacts/` directory (SP1 + ethrex + P3 outputs) is uploaded
+as one artifact named `bench-vs-nightly-<run_number>-<sha>` with 90-day
+retention. A "Lambda VM vs Plonky3" section is appended to the same
+Slack post that publishes the SP1 and ethrex results.
 
 ## Breakdown (per-phase timing) for manual analysis
 
diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh
index 20f1f6331..a4ce67fc2 100755
--- a/bench_vs_plonky3/run.sh
+++ b/bench_vs_plonky3/run.sh
@@ -242,10 +242,6 @@ ratio_fmt() {
     }'
 }
 
-mean_file() {
-    LC_NUMERIC=C awk '{ s += $1; n++ } END { if (n == 0) print "n/a"; else printf "%.6f\n", s / n }' "$1"
-}
-
 median_file() {
     LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk '
         { a[NR] = $0 + 0 }
@@ -256,18 +252,6 @@ median_file() {
         }'
 }
 
-stddev_file() {
-    LC_NUMERIC=C awk '
-        { s += $1; ss += $1 * $1; n++ }
-        END {
-            if (n == 0) { print "n/a"; exit }
-            m = s / n
-            v = (ss / n) - (m * m)
-            if (v < 0) v = 0
-            printf "%.6f\n", sqrt(v)
-        }' "$1"
-}
-
 cv_pct_file() {
     LC_NUMERIC=C awk '
         { s += $1; ss += $1 * $1; n++ }
@@ -282,14 +266,6 @@ cv_pct_file() {
         }' "$1"
 }
 
-min_file() {
-    LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk 'NR == 1 { printf "%.6f\n", $1; exit }'
-}
-
-max_file() {
-    LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk '{ x = $1 } END { if (NR == 0) print "n/a"; else printf "%.6f\n", x }'
-}
-
 fmt0() {
     LC_NUMERIC=C awk -v v="$1" 'BEGIN { if (v == "n/a") print v; else printf "%.0f\n", v }'
 }
@@ -354,8 +330,8 @@ run_prover() {
             run_args+=(--breakdown)
         fi
         if ! "$BIN" "${run_args[@]}" > "$out_file" 2>&1; then
-            echo -e "  ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}"
-            cat "$out_file"
+            echo -e "  ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}" >&2
+            cat "$out_file" >&2
             exit 1
         fi
         local audit_line
@@ -366,8 +342,8 @@ run_prover() {
         local metrics_line
         metrics_line=$(extract_metrics_line < "$out_file")
         if [ -z "$metrics_line" ]; then
-            echo -e "  ${RED}[${prover}] could not parse metrics (log-rows=${log_rows}, run ${run_i})${NC}"
-            cat "$out_file"
+            echo -e "  ${RED}[${prover}] could not parse metrics (log-rows=${log_rows}, run ${run_i})${NC}" >&2
+            cat "$out_file" >&2
             exit 1
         fi
         printf '%s\n' "$metrics_line" >> "$metrics_file"
diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs
index b71b2ce4e..c132f57a5 100644
--- a/bench_vs_plonky3/src/bin/prove_bench.rs
+++ b/bench_vs_plonky3/src/bin/prove_bench.rs
@@ -12,9 +12,9 @@
 //!   log-rows=19, num-sequences=16, blowup=2, queries=219, grinding=0.
 
 use std::process::ExitCode;
-use std::sync::{Arc, Mutex};
 use std::time::Instant;
 
+use bench_vs_plonky3::span_timing::{P3TimingLayer, SpanResults as P3SpanResults};
 use bench_vs_plonky3::{lambda_fibonacci_pair, plonky3_config, plonky3_fibonacci};
 use crypto::fiat_shamir::default_transcript::DefaultTranscript;
 use math::field::element::FieldElement;
@@ -327,74 +327,8 @@ fn emit_lambda_breakdown(args: &Args, rows: usize, total_ms: f64) {
     eprintln!("warning: Lambda phase breakdown requires building with --features instruments");
 }
 
-struct SpanState {
-    name: String,
-    active_since: Option<Instant>,
-    accumulated: std::time::Duration,
-}
-
-struct P3TimingLayer {
-    spans: Mutex<std::collections::HashMap<u64, SpanState>>,
-    results: Arc<Mutex<Vec<(String, f64)>>>,
-}
-
-impl<S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>>
-    tracing_subscriber::Layer<S> for P3TimingLayer
-{
-    fn on_new_span(
-        &self,
-        attrs: &tracing::span::Attributes<'_>,
-        id: &tracing::span::Id,
-        _ctx: tracing_subscriber::layer::Context<'_, S>,
-    ) {
-        self.spans.lock().unwrap().insert(
-            id.into_u64(),
-            SpanState {
-                name: attrs.metadata().name().to_string(),
-                active_since: None,
-                accumulated: std::time::Duration::ZERO,
-            },
-        );
-    }
-
-    fn on_enter(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
-        if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
-            && entry.active_since.is_none()
-        {
-            entry.active_since = Some(Instant::now());
-        }
-    }
-
-    fn on_exit(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
-        if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
-            && let Some(start) = entry.active_since.take()
-        {
-            entry.accumulated += start.elapsed();
-        }
-    }
-
-    fn on_close(&self, id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
-        if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) {
-            let mut total = entry.accumulated;
-            if let Some(start) = entry.active_since {
-                total += start.elapsed();
-            }
-            self.results
-                .lock()
-                .unwrap()
-                .push((entry.name, ms(total.as_secs_f64())));
-        }
-    }
-}
-
-type P3SpanResults = Arc<Mutex<Vec<(String, f64)>>>;
-
 fn p3_span_subscriber() -> (impl tracing::Subscriber + Send + Sync, P3SpanResults) {
-    let results = Arc::new(Mutex::new(Vec::new()));
-    let layer = P3TimingLayer {
-        spans: Mutex::new(std::collections::HashMap::new()),
-        results: Arc::clone(&results),
-    };
+    let (layer, results) = P3TimingLayer::new();
     let filter = tracing_subscriber::filter::LevelFilter::DEBUG;
     (
         tracing_subscriber::registry().with(filter).with(layer),
@@ -411,13 +345,17 @@ fn peak_rss_kb() -> Option<u64> {
     }
 
     let maxrss = unsafe { usage.assume_init().ru_maxrss };
+    if maxrss < 0 {
+        return None;
+    }
+    let maxrss = maxrss as u64;
     #[cfg(target_os = "macos")]
     {
-        Some((maxrss as u64).div_ceil(1024))
+        Some(maxrss.div_ceil(1024))
     }
     #[cfg(not(target_os = "macos"))]
     {
-        Some(maxrss as u64)
+        Some(maxrss)
     }
 }
 
diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
index 60596bddc..bae1235dc 100644
--- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
+++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs
@@ -271,6 +271,13 @@ where
         _bus_public_inputs: Option<&stark::lookup::BusPublicInputs<Self::FieldExtension>>,
         _trace_length: usize,
     ) -> BoundaryConstraints<Self::FieldExtension> {
+        assert_eq!(
+            pub_inputs.initial_values.len(),
+            self.num_sequences,
+            "AIR built for {} sequences, public inputs carry {}",
+            self.num_sequences,
+            pub_inputs.initial_values.len(),
+        );
         let mut constraints = Vec::with_capacity(2 * pub_inputs.initial_values.len());
         for (seq_idx, (a, b)) in pub_inputs.initial_values.iter().enumerate() {
             constraints.push(BoundaryConstraint::new_main(
diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs
index 7c722153e..dd5cbf675 100644
--- a/bench_vs_plonky3/src/lib.rs
+++ b/bench_vs_plonky3/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod lambda_fibonacci_pair;
 pub mod plonky3_config;
 pub mod plonky3_fibonacci;
+pub mod span_timing;
 
 #[cfg(test)]
 mod tests {
@@ -206,93 +207,9 @@ mod tests {
         println!("\n============================================================");
         println!("Plonky3 STARK Span Breakdown");
 
-        use std::collections::HashMap;
-        use std::sync::{Arc, Mutex};
         use tracing_subscriber::layer::SubscriberExt;
 
-        type SpanResults = Arc<Mutex<Vec<(String, f64)>>>;
-
-        struct SpanState {
-            name: String,
-            active_since: Option<std::time::Instant>,
-            accumulated: std::time::Duration,
-        }
-
-        struct P3TimingLayer {
-            spans: Mutex<HashMap<u64, SpanState>>,
-            results: SpanResults,
-        }
-
-        impl<
-            S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>,
-        > tracing_subscriber::Layer<S> for P3TimingLayer
-        {
-            fn on_new_span(
-                &self,
-                attrs: &tracing::span::Attributes<'_>,
-                id: &tracing::span::Id,
-                _ctx: tracing_subscriber::layer::Context<'_, S>,
-            ) {
-                let name = attrs.metadata().name().to_string();
-                self.spans.lock().unwrap().insert(
-                    id.into_u64(),
-                    SpanState {
-                        name,
-                        active_since: None,
-                        accumulated: std::time::Duration::ZERO,
-                    },
-                );
-            }
-
-            // Rayon can re-enter a span across threads, so only start timing on
-            // the first enter after each exit; accumulate every interval.
-            fn on_enter(
-                &self,
-                id: &tracing::span::Id,
-                _ctx: tracing_subscriber::layer::Context<'_, S>,
-            ) {
-                if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
-                    && entry.active_since.is_none()
-                {
-                    entry.active_since = Some(std::time::Instant::now());
-                }
-            }
-
-            fn on_exit(
-                &self,
-                id: &tracing::span::Id,
-                _ctx: tracing_subscriber::layer::Context<'_, S>,
-            ) {
-                if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64())
-                    && let Some(start) = entry.active_since.take()
-                {
-                    entry.accumulated += start.elapsed();
-                }
-            }
-
-            fn on_close(
-                &self,
-                id: tracing::span::Id,
-                _ctx: tracing_subscriber::layer::Context<'_, S>,
-            ) {
-                if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) {
-                    // If we never saw on_exit (span closed while active), include
-                    // the dangling interval.
-                    let mut total = entry.accumulated;
-                    if let Some(start) = entry.active_since {
-                        total += start.elapsed();
-                    }
-                    let ms = total.as_secs_f64() * 1000.0;
-                    self.results.lock().unwrap().push((entry.name, ms));
-                }
-            }
-        }
-
-        let results: SpanResults = Arc::new(Mutex::new(Vec::new()));
-        let layer = P3TimingLayer {
-            spans: Mutex::new(HashMap::new()),
-            results: Arc::clone(&results),
-        };
+        let (layer, results) = crate::span_timing::P3TimingLayer::new();
         let filter = tracing_subscriber::filter::LevelFilter::DEBUG;
         let subscriber = tracing_subscriber::registry().with(filter).with(layer);
 

From cd41dd7702759a95f1b573c7ab855d71940132df Mon Sep 17 00:00:00 2001
From: jotabulacios <jbulacios@fi.uba.ar>
Date: Wed, 20 May 2026 12:06:38 -0300
Subject: [PATCH 34/34] add missing file

---
 bench_vs_plonky3/src/span_timing.rs | 113 ++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 bench_vs_plonky3/src/span_timing.rs

diff --git a/bench_vs_plonky3/src/span_timing.rs b/bench_vs_plonky3/src/span_timing.rs
new file mode 100644
index 000000000..4d37423fb
--- /dev/null
+++ b/bench_vs_plonky3/src/span_timing.rs
@@ -0,0 +1,113 @@
+//! Tracing layer that accumulates per-span wall-clock durations from
+//! Plonky3's `tracing` instrumentation. Used by `prove_bench --breakdown`
+//! and by the `instruments_breakdown` test.
+//!
+//! A span's clock runs from its outermost enter to the matching last exit,
+//! so nested or overlapping enters of the same span are counted once.
+
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+use tracing_subscriber::Layer;
+
+/// Shared list of finished spans: one `(span name, elapsed milliseconds)`
+/// entry is pushed each time a span closes.
+pub type SpanResults = Arc<Mutex<Vec<(String, f64)>>>;
+
+/// Timing bookkeeping for one span that has been created but not yet closed.
+struct SpanState {
+    /// Span name, copied from the span's metadata.
+    name: String,
+    /// Start of the interval currently being timed, if any.
+    active_since: Option<Instant>,
+    /// Enters that have not been matched by an exit yet.
+    enter_depth: usize,
+    /// Sum of all completed intervals.
+    accumulated: Duration,
+}
+
+/// `tracing_subscriber` layer that times every span and reports the total
+/// when the span closes.
+pub struct P3TimingLayer {
+    /// Open spans, keyed by `Id::into_u64()`.
+    spans: Mutex<HashMap<u64, SpanState>>,
+    /// Result list shared with whoever called [`P3TimingLayer::new`].
+    results: SpanResults,
+}
+
+impl P3TimingLayer {
+    /// Builds a layer plus a handle to the list its results are pushed to.
+    pub fn new() -> (Self, SpanResults) {
+        let results: SpanResults = Arc::new(Mutex::new(Vec::new()));
+        let layer = Self {
+            spans: Mutex::new(HashMap::new()),
+            results: Arc::clone(&results),
+        };
+        (layer, results)
+    }
+}
+
+impl<S> Layer<S> for P3TimingLayer
+where
+    S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>,
+{
+    /// Registers a new span with a stopped clock and zero accumulated time.
+    fn on_new_span(
+        &self,
+        attrs: &tracing::span::Attributes<'_>,
+        id: &tracing::span::Id,
+        _ctx: tracing_subscriber::layer::Context<'_, S>,
+    ) {
+        self.spans.lock().unwrap().insert(
+            id.into_u64(),
+            SpanState {
+                name: attrs.metadata().name().to_string(),
+                active_since: None,
+                enter_depth: 0,
+                accumulated: Duration::ZERO,
+            },
+        );
+    }
+
+    /// Starts the clock on the outermost enter; an enter that arrives while
+    /// the span is already entered only raises the depth.
+    fn on_enter(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
+        if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) {
+            entry.enter_depth += 1;
+            if entry.active_since.is_none() {
+                entry.active_since = Some(Instant::now());
+            }
+        }
+    }
+
+    /// Stops the clock once the last outstanding enter has exited. Stopping
+    /// on the first exit would drop the time between that exit and the last
+    /// one whenever enters of the same span overlap.
+    fn on_exit(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
+        if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) {
+            // Saturate so an exit without a recorded enter cannot underflow.
+            entry.enter_depth = entry.enter_depth.saturating_sub(1);
+            if entry.enter_depth == 0
+                && let Some(start) = entry.active_since.take()
+            {
+                entry.accumulated += start.elapsed();
+            }
+        }
+    }
+
+    /// Removes the span and pushes `(name, total milliseconds)` to the results.
+    fn on_close(&self, id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) {
+        if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) {
+            let mut total = entry.accumulated;
+            // Closed while the clock was still running: count that interval too.
+            if let Some(start) = entry.active_since {
+                total += start.elapsed();
+            }
+            self.results
+                .lock()
+                .unwrap()
+                .push((entry.name, total.as_secs_f64() * 1000.0));
+        }
+    }
+}