From f7f842bba7b9e96d58398781480d2ca11abc46bf Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 25 Feb 2026 19:00:35 -0300 Subject: [PATCH 01/34] bench vs others --- .gitignore | 1 + bench_vs/README.md | 59 + bench_vs/run.sh | 195 + bench_vs/sp1/fibonacci/.tldr/daemon.pid | 1 + bench_vs/sp1/fibonacci/.tldr/status | 1 + bench_vs/sp1/fibonacci/.tldrignore | 84 + bench_vs/sp1/fibonacci/Cargo.lock | 6182 ++++++++++++++++++++ bench_vs/sp1/fibonacci/Cargo.toml | 3 + bench_vs/sp1/fibonacci/program/Cargo.toml | 7 + bench_vs/sp1/fibonacci/program/src/main.rs | 14 + bench_vs/sp1/fibonacci/rust-toolchain | 3 + bench_vs/sp1/fibonacci/script/Cargo.toml | 10 + bench_vs/sp1/fibonacci/script/build.rs | 5 + bench_vs/sp1/fibonacci/script/src/main.rs | 47 + 14 files changed, 6612 insertions(+) create mode 100644 bench_vs/README.md create mode 100755 bench_vs/run.sh create mode 100644 bench_vs/sp1/fibonacci/.tldr/daemon.pid create mode 100644 bench_vs/sp1/fibonacci/.tldr/status create mode 100644 bench_vs/sp1/fibonacci/.tldrignore create mode 100644 bench_vs/sp1/fibonacci/Cargo.lock create mode 100644 bench_vs/sp1/fibonacci/Cargo.toml create mode 100644 bench_vs/sp1/fibonacci/program/Cargo.toml create mode 100644 bench_vs/sp1/fibonacci/program/src/main.rs create mode 100644 bench_vs/sp1/fibonacci/rust-toolchain create mode 100644 bench_vs/sp1/fibonacci/script/Cargo.toml create mode 100644 bench_vs/sp1/fibonacci/script/build.rs create mode 100644 bench_vs/sp1/fibonacci/script/src/main.rs diff --git a/.gitignore b/.gitignore index 9c826f0d9..3ef9f8283 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ executor/program_artifacts/ # Shared cargo target directory for ELF builds executor/shared_target/ + diff --git a/bench_vs/README.md b/bench_vs/README.md new file mode 100644 index 000000000..0a20304c3 --- /dev/null +++ b/bench_vs/README.md @@ -0,0 +1,59 @@ +# Lambda VM vs SP1 v6 Benchmark + +Compares proving time for an identical u64 wrapping Fibonacci computation. 
+ +## Prerequisites + +1. **Lambda VM CLI** (built from this repo): + ```bash + cargo build --release -p cli + ``` + +2. **SP1 toolchain** (Succinct's prover): + ```bash + curl -L https://sp1up.succinct.xyz | bash + sp1up + ``` + +3. **RISC-V assembler** — Homebrew clang + ld.lld (macOS): + ```bash + brew install llvm + ``` + +## Usage + +```bash +# Default series: 1k, 10k, 100k, 300k iterations +./bench_vs/run.sh + +# Custom series +./bench_vs/run.sh -n 1000 50000 + +# Run only one prover +./bench_vs/run.sh --lambda-only +./bench_vs/run.sh --sp1-only +``` + +## What it measures + +Both provers execute the same program: iterative Fibonacci with `u64::wrapping_add`. +Only **proving time** is compared (wall-clock, no recursion/compression on either side). + +- **Lambda VM**: Generates RISC-V assembly at runtime, assembles to ELF, proves via the CLI. +- **SP1 v6**: Compiles a Rust guest program to RISC-V, proves via `sp1-sdk` core mode. + +## Output + +``` +=== Summary === +Program: Fibonacci (u64 wrapping) + + n Lambda VM SP1 v6 Ratio + --- --------- ------ ----- + 1000 13.3s 12.4s 0.9x + 10000 22.4s 12.9s 0.6x + 100000 116.4s 14.7s 0.1x + 300000 ... ... ... + +Green ratio = Lambda VM faster, Red = SP1 faster +``` diff --git a/bench_vs/run.sh b/bench_vs/run.sh new file mode 100755 index 000000000..1575e62a3 --- /dev/null +++ b/bench_vs/run.sh @@ -0,0 +1,195 @@ +#!/bin/bash +# Benchmark: Lambda VM vs SP1 v6 — Fibonacci proving time comparison. 
+# +# Usage: ./bench_vs/run.sh [-n 1000 50000 100000] [--lambda-only | --sp1-only] +# +# Without -n, runs the default series: 1000 10000 100000 300000 +# With -n, runs the specified values (space-separated): -n 1000 50000 +# +# Prerequisites: +# - Lambda VM CLI built: cargo build --release -p cli +# - SP1 toolchain installed: curl -L https://sp1up.succinct.xyz | bash && sp1up +# - clang with RISC-V target support (macOS Homebrew clang works) +# - ld.lld linker + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="/tmp/bench_fib" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BOLD='\033[1m' +NC='\033[0m' + +# --- Defaults ---------------------------------------------------------------- +DEFAULT_SERIES=(1000 10000 100000 300000) +SERIES=() +RUN_LAMBDA=true +RUN_SP1=true + +# --- Parse args -------------------------------------------------------------- +while [[ $# -gt 0 ]]; do + case $1 in + -n) shift + while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do + SERIES+=("$1"); shift + done ;; + --lambda-only) RUN_SP1=false; shift ;; + --sp1-only) RUN_LAMBDA=false; shift ;; + -h|--help) + echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only]" + echo "" + echo " -n N1 N2 ... Fibonacci iteration counts (space-separated)" + echo " Default series: ${DEFAULT_SERIES[*]}" + echo " --lambda-only Only run Lambda VM benchmark" + echo " --sp1-only Only run SP1 benchmark" + exit 0 + ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +if [ ${#SERIES[@]} -eq 0 ]; then + SERIES=("${DEFAULT_SERIES[@]}") +fi + +echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}" +echo -e "Series: ${YELLOW}${SERIES[*]}${NC}" +echo "" + +rm -rf "$TMP_DIR" && mkdir -p "$TMP_DIR" + +# --- Pre-build --------------------------------------------------------------- + +CLI="$ROOT_DIR/target/release/cli" +if $RUN_LAMBDA && [ ! 
-f "$CLI" ]; then + echo -e "${YELLOW}[Lambda VM] CLI not found, building...${NC}" + cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1 +fi + +SP1_BIN="" +if $RUN_SP1; then + SP1_DIR="$SCRIPT_DIR/sp1/fibonacci" + echo -e "${GREEN}[SP1 v6] Building fibonacci prover...${NC}" + (cd "$SP1_DIR" && cargo build --release 2>&1 | tail -5) + SP1_BIN="$SP1_DIR/target/release/fibonacci-script" + if [ ! -f "$SP1_BIN" ]; then + echo -e "${RED}[SP1 v6] Build failed — fibonacci-script binary not found${NC}" + exit 1 + fi +fi + +# --- Run one benchmark -------------------------------------------------------- + +# Arrays to collect results for the summary table +declare -a RESULT_N RESULT_LAMBDA RESULT_SP1 + +run_one() { + local N=$1 + echo "" + echo -e "${BOLD}--- n=${N} ---${NC}" + + local lambda_time="" + local sp1_time="" + local sp1_cycles="" + + if $RUN_LAMBDA; then + # Generate assembly + cat > "$TMP_DIR/fib.s" </dev/null) + lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') + echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" + fi + + if $RUN_SP1; then + echo -e " ${GREEN}[SP1 v6] Proving...${NC}" + SP1_OUTPUT=$("$SP1_BIN" "$N" 2>/dev/null) + sp1_time=$(echo "$SP1_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') + sp1_cycles=$(echo "$SP1_OUTPUT" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*') + echo -e " SP1 v6: ${BOLD}${sp1_time}s${NC} (${sp1_cycles} cycles)" + fi + + RESULT_N+=("$N") + RESULT_LAMBDA+=("${lambda_time:-n/a}") + RESULT_SP1+=("${sp1_time:-n/a}") +} + +# --- Run series --------------------------------------------------------------- + +for N in "${SERIES[@]}"; do + run_one "$N" +done + +# --- Summary table ------------------------------------------------------------ + +echo "" +echo -e "${BOLD}=== Summary ===${NC}" +echo -e "Program: Fibonacci (u64 wrapping)" +echo "" + +# Header +if $RUN_LAMBDA && $RUN_SP1; then + printf " %-10s %12s %12s %8s\n" "n" "Lambda VM" "SP1 v6" 
"Ratio" + printf " %-10s %12s %12s %8s\n" "---" "---------" "------" "-----" +elif $RUN_LAMBDA; then + printf " %-10s %12s\n" "n" "Lambda VM" + printf " %-10s %12s\n" "---" "---------" +else + printf " %-10s %12s\n" "n" "SP1 v6" + printf " %-10s %12s\n" "---" "------" +fi + +for i in "${!RESULT_N[@]}"; do + n="${RESULT_N[$i]}" + lt="${RESULT_LAMBDA[$i]}" + st="${RESULT_SP1[$i]}" + + if $RUN_LAMBDA && $RUN_SP1; then + if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then + RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $st / $lt}") + if (( $(LC_NUMERIC=C awk "BEGIN {print ($st > $lt)}") )); then + RATIO="${GREEN}${RATIO}${NC}" + else + RATIO="${RED}${RATIO}${NC}" + fi + printf " %-10s %11ss %11ss " "$n" "$lt" "$st" + echo -e "$RATIO" + else + printf " %-10s %12s %12s %8s\n" "$n" "${lt}s" "${st}s" "-" + fi + elif $RUN_LAMBDA; then + printf " %-10s %11ss\n" "$n" "$lt" + else + printf " %-10s %11ss\n" "$n" "$st" + fi +done + +echo "" +echo -e "Green ratio = Lambda VM faster, Red = SP1 faster" +echo "Raw data in $TMP_DIR/" diff --git a/bench_vs/sp1/fibonacci/.tldr/daemon.pid b/bench_vs/sp1/fibonacci/.tldr/daemon.pid new file mode 100644 index 000000000..10eda36c4 --- /dev/null +++ b/bench_vs/sp1/fibonacci/.tldr/daemon.pid @@ -0,0 +1 @@ +39495 \ No newline at end of file diff --git a/bench_vs/sp1/fibonacci/.tldr/status b/bench_vs/sp1/fibonacci/.tldr/status new file mode 100644 index 000000000..ad50b5340 --- /dev/null +++ b/bench_vs/sp1/fibonacci/.tldr/status @@ -0,0 +1 @@ +ready \ No newline at end of file diff --git a/bench_vs/sp1/fibonacci/.tldrignore b/bench_vs/sp1/fibonacci/.tldrignore new file mode 100644 index 000000000..e01df83cb --- /dev/null +++ b/bench_vs/sp1/fibonacci/.tldrignore @@ -0,0 +1,84 @@ +# TLDR ignore patterns (gitignore syntax) +# Auto-generated - review and customize for your project +# Docs: https://git-scm.com/docs/gitignore + +# =================== +# Dependencies +# =================== +node_modules/ +.venv/ +venv/ +env/ +__pycache__/ +.tox/ 
+.nox/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +vendor/ +Pods/ + +# =================== +# Build outputs +# =================== +dist/ +build/ +out/ +target/ +*.egg-info/ +*.whl +*.pyc +*.pyo + +# =================== +# Binary/large files +# =================== +*.so +*.dylib +*.dll +*.exe +*.bin +*.o +*.a +*.lib + +# =================== +# IDE/editors +# =================== +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# =================== +# Security (always exclude) +# =================== +.env +.env.* +*.pem +*.key +*.p12 +*.pfx +credentials.* +secrets.* + +# =================== +# Version control +# =================== +.git/ +.hg/ +.svn/ + +# =================== +# OS files +# =================== +.DS_Store +Thumbs.db + +# =================== +# Project-specific +# Add your custom patterns below +# =================== +# large_test_fixtures/ +# data/ diff --git a/bench_vs/sp1/fibonacci/Cargo.lock b/bench_vs/sp1/fibonacci/Cargo.lock new file mode 100644 index 000000000..8825cad2e --- /dev/null +++ b/bench_vs/sp1/fibonacci/Cargo.lock @@ -0,0 +1,6182 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "addchain" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2e69442aa5628ea6951fa33e24efe8313f4321a91bd729fc2f75bdfc858570" +dependencies = [ + "num-bigint 0.3.3", + "num-integer", + "num-traits", +] + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ark-ff" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec847af850f44ad29048935519032c33da8aa03340876d351dfab5660d2966ba" +dependencies = [ + "ark-ff-asm", + "ark-ff-macros", + "ark-serialize", + "ark-std", + "derivative", + "digest", + "itertools 0.10.5", + "num-bigint 0.4.6", + "num-traits", + "paste", + "rustc_version", + "zeroize", +] + +[[package]] +name = "ark-ff-asm" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "3ed4aa4fe255d0bc6d79373f7e31d2ea147bcf486cba1be5ba7ea85abdb92348" +dependencies = [ + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ark-ff-macros" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7abe79b0e4288889c4574159ab790824d0033b9fdcb2a112a3182fac2e514565" +dependencies = [ + "num-bigint 0.4.6", + "num-traits", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ark-serialize" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb7b85a02b83d2f22f89bd5cac66c9c89474240cb6207cb1efc16d098e822a5" +dependencies = [ + "ark-std", + "digest", + "num-bigint 0.4.6", +] + +[[package]] +name = "ark-std" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94893f1e0c6eeab764ade8dc4c0db24caf4fe7cbbaafc0eba0a9030f447b5185" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "async-scoped" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4042078ea593edffc452eef14e99fdb2b120caa4ad9618bcdeabc4a023b98740" +dependencies = [ + "futures", + "pin-project", + "tokio", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "atomic" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", 
+ "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "serde", + "windows-link", +] + +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bindgen" +version = "0.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.117", +] + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = 
"bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake2b_simd" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b79834656f71332577234b50bfc009996f7449e0c056884e6a02492ded0ca2f3" +dependencies = [ + "arrayref", + "arrayvec", + "constant_time_eq", +] + +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array 0.14.9", +] + +[[package]] +name = "bls12_381" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3c196a77437e7cc2fb515ce413a6401291578b5afc8ecb29a3c7ab957f05941" +dependencies = [ + "ff 0.12.1", + "group 0.12.1", + "pairing", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "byte-slice-cast" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7575182f7272186991736b70173b0ea045398f984bf5ebbb3804736ce1330c9d" + 
+[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "camino" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" +dependencies = [ + "serde_core", +] + +[[package]] +name = "cargo-platform" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", + "thiserror 1.0.69", +] + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link", +] + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width 0.2.2", + "windows-sys 0.59.0", +] + +[[package]] +name = "const-default" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b396d1f76d455557e1218ec8066ae14bba60b4b36ecd55577ba979f5db7ecaa" + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const_format" +version = "0.2.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7faa7469a93a566e9ccc1c73fe783b4a65c274c5ace346038dca9c39fe0030ad" +dependencies = [ + "const_format_proc_macros", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array 0.14.9", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array 0.14.9", + "typenum", +] + +[[package]] +name = "dashu" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b3e5ac1e23ff1995ef05b912e2b012a8784506987a2651552db2c73fb3d7e0" +dependencies = [ + "dashu-base", + "dashu-float", + "dashu-int", + "dashu-macros", + "dashu-ratio", + "rustversion", +] + +[[package]] +name = "dashu-base" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b80bf6b85aa68c58ffea2ddb040109943049ce3fbdf4385d0380aef08ef289" + +[[package]] +name = "dashu-float" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85078445a8dbd2e1bd21f04a816f352db8d333643f0c9b78ca7c3d1df71063e7" +dependencies = [ + "dashu-base", + "dashu-int", + "num-modular", + "num-order", + "rustversion", + "static_assertions", +] + +[[package]] +name = "dashu-int" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee99d08031ca34a4d044efbbb21dff9b8c54bb9d8c82a189187c0651ffdb9fbf" +dependencies = [ + "cfg-if", + "dashu-base", + "num-modular", + "num-order", + "rustversion", + "static_assertions", +] + +[[package]] 
+name = "dashu-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93381c3ef6366766f6e9ed9cf09e4ef9dec69499baf04f0c60e70d653cf0ab10" +dependencies = [ + "dashu-base", + "dashu-float", + "dashu-int", + "dashu-ratio", + "paste", + "proc-macro2", + "quote", + "rustversion", +] + +[[package]] +name = "dashu-ratio" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e33b04dd7ce1ccf8a02a69d3419e354f2bbfdf4eb911a0b7465487248764c9" +dependencies = [ + "dashu-base", + "dashu-float", + "dashu-int", + "num-modular", + "num-order", + "rustversion", +] + +[[package]] +name = "deepsize2" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b5184084af9beed35eecbf4c36baf6e26b9dc47b61b74e02f930c72a58e71b" +dependencies = [ + "deepsize_derive2", + "hashbrown 0.14.5", +] + +[[package]] +name = "deepsize_derive2" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0f8817865cacf3b93b943ca06b0fc5fd8e99eabfdb7ea5d296efcbc4afc4f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive-where" +version = "1.6.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef941ded77d15ca19b40374869ac6000af1c9f2a4c0f3d4c70926287e6364a8f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "downcast-rs" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + 
+[[package]] +name = "downloader" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac1e888d6830712d565b2f3a974be3200be9296bc1b03db8251a4cbf18a4a34" +dependencies = [ + "digest", + "futures", + "rand 0.8.5", + "reqwest", + "thiserror 1.0.69", + "tokio", +] + +[[package]] +name = "dynasm" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7d4c414c94bc830797115b8e5f434d58e7e80cb42ba88508c14bc6ea270625" +dependencies = [ + "bitflags", + "byteorder", + "lazy_static", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "dynasmrt" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "602f7458a3859195fb840e6e0cce5f4330dd9dfbfece0edaf31fe427af346f55" +dependencies = [ + "byteorder", + "dynasm", + "fnv", + "memmap2", +] + +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der", + "digest", + "elliptic-curve", + "rfc6979", + "serdect", + "signature", + "spki", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +dependencies = [ + "serde", +] + +[[package]] +name = "elf" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4445909572dbd556c457c849c4ca58623d84b27c8fff1e74b0b4227d8b90d17b" + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct", + "crypto-bigint", + "digest", + "ff 0.13.1", + "generic-array 0.14.9", + "group 0.13.0", + "pem-rfc7468", + "pkcs8", + 
"rand_core 0.6.4", + "sec1", + "serdect", + "subtle", + "zeroize", +] + +[[package]] +name = "embedded-alloc" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f2de9133f68db0d4627ad69db767726c99ff8585272716708227008d3f1bddd" +dependencies = [ + "const-default", + "critical-section", + "linked_list_allocator", + "rlsf", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "enum-map" +version = "2.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" +dependencies = [ + "enum-map-derive", + "serde", +] + +[[package]] +name = "enum-map-derive" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "eventsource-stream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab" +dependencies = [ + "futures-core", + "nom", + "pin-project-lite", +] + +[[package]] +name = "eyre" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7cd915d99f24784cdc19fd37ef22b97e3ff0ae756c7e492e9fbfe897d61e2aec" +dependencies = [ + "indenter", + "once_cell", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "bitvec", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "bitvec", + "byteorder", + "ff_derive", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "ff_derive" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f10d12652036b0e99197587c6ba87a8fc3031986499973c030d8b44fcc151b60" +dependencies = [ + "addchain", + "num-bigint 0.3.3", + "num-integer", + "num-traits", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "fibonacci-program" +version = "0.1.0" +dependencies = [ + "sp1-zkvm", +] + +[[package]] +name = "fibonacci-script" +version = "0.1.0" +dependencies = [ + "sp1-build", + "sp1-sdk", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "gcd" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d758ba1b47b00caf47f24925c0074ecb20d6dfcffe7f6d53395c0465674841a" + +[[package]] +name = "gen_ops" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a" + +[[package]] +name = "generic-array" +version = "0.14.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" +dependencies = [ + "typenum", + "version_check", + "zeroize", +] + +[[package]] +name = "generic-array" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96512db27971c2c3eece70a1e106fbe6c87760234e31e8f7e5634912fe52794a" +dependencies = 
[ + "serde", + "typenum", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff 0.12.1", + "memuse", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.13.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "halo2" +version = "0.1.0-beta.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a23c779b38253fe1538102da44ad5bd5378495a61d2c4ee18d64eaa61ae5995" +dependencies = [ + "halo2_proofs", +] + +[[package]] +name = "halo2_proofs" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e925780549adee8364c7f2b685c753f6f3df23bde520c67416e93bf615933760" +dependencies = [ + "blake2b_simd", + "ff 0.12.1", + "group 0.12.1", + "pasta_curves 0.4.1", + "rand_core 0.6.4", + "rayon", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", + "serde", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + 
"futures-channel", + "futures-core", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2 0.6.2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core 0.62.2", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "impl-trait-for-tuples" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0eb5a3343abf848c0984fe4604b2b105da9539376e24fc0a3b0007411ae4fd9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "indenter" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width 0.2.2", + "web-time", +] + 
+[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "js-sys" +version = "0.3.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6" +dependencies = [ + 
"once_cell", + "wasm-bindgen", +] + +[[package]] +name = "jubjub" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a575df5f985fe1cd5b2b05664ff6accfc46559032b954529fd225a2168d27b0f" +dependencies = [ + "bitvec", + "bls12_381", + "ff 0.12.1", + "group 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "k256" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b" +dependencies = [ + "cfg-if", + "ecdsa", + "elliptic-curve", + "once_cell", + "serdect", + "sha2", + "signature", +] + +[[package]] +name = "keccak" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "libredox" 
+version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +dependencies = [ + "bitflags", + "libc", +] + +[[package]] +name = "linked_list_allocator" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afa463f5405ee81cdb9cc2baf37e08ec7e4c8209442b5d72c04cfb2cd6e6286" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memfd" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad38eb12aea514a0466ea40a80fd8cc83637065948eb4a426e4aa46261175227" +dependencies = [ + "rustix", +] + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + +[[package]] +name = "memuse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d97bbf43eb4f088f8ca469930cde17fa036207c9a5e02ccc5107c4e8b17c964" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "mti" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9563a7d5556636e74bbd8773241fbcbc5c89b9f6bfdc97b29b56e740c2c74b9" +dependencies = [ + "typeid_prefix", + "typeid_suffix", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint 0.4.6", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6f7833f2cbf2360a6cfd58cd41a53aa7a90bd4c202f5b1c7dd2ed73c57b2c3" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-modular" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17bb261bf36fa7d83f4c294f834e91256769097b3cb505d44831e0a179ac647f" + +[[package]] +name = "num-order" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537b596b97c40fcf8056d153049eb22f481c17ebce72a513ec9286e4986d1bb6" +dependencies = [ + "num-modular", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint 0.4.6", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_enum" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799" +dependencies = [ + "proc-macro-crate 1.3.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "opentelemetry" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", 
+ "once_cell", + "pin-project-lite", + "thiserror 1.0.69", +] + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa", + "elliptic-curve", + "primeorder", + "sha2", +] + +[[package]] +name = "p3-air" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d275c27bb81483d669709d7244ce333b51f9743af2474cdc09ba1509f5c290db" +dependencies = [ + "p3-field", + "p3-matrix", + "serde", +] + +[[package]] +name = "p3-baby-bear" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a083928c9055f2171e3cb0bb4767969e4955473e71ba61affe46d7a3c98a89" +dependencies = [ + "num-bigint 0.4.6", + "p3-field", + "p3-mds", + "p3-poseidon2", + "p3-symmetric", + "rand 0.8.5", + "serde", +] + +[[package]] +name = "p3-bn254-fr" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abf208fbfe540d6e2a6caaa2a9a345b1c8cb23ffdcdfcc6987244525d4fc821" +dependencies = [ + "ff 0.13.1", + "num-bigint 0.4.6", + "p3-field", + "p3-poseidon2", + "p3-symmetric", + "rand 0.8.5", + "serde", +] + +[[package]] +name = "p3-challenger" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42b725b453bbb35117a1abf0ddfd900b0676063d6e4231e0fa6bb0d76018d8ad" +dependencies = [ + "p3-field", + "p3-maybe-rayon", + "p3-symmetric", + "p3-util", + "serde", + "tracing", +] + +[[package]] +name = "p3-commit" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"518695b56f450f9223bdd8994dda87916b97ebf1d1c03c956807e78522fdb333" +dependencies = [ + "itertools 0.12.1", + "p3-challenger", + "p3-field", + "p3-matrix", + "p3-util", + "serde", +] + +[[package]] +name = "p3-dft" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56a1f81101bff744b7ebba7f4497e917a2c6716d6e62736e4a56e555a2d98cb7" +dependencies = [ + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "tracing", +] + +[[package]] +name = "p3-field" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36459d4acb03d08097d713f336c7393990bb489ab19920d4f68658c7a5c10968" +dependencies = [ + "itertools 0.12.1", + "num-bigint 0.4.6", + "num-traits", + "p3-util", + "rand 0.8.5", + "serde", +] + +[[package]] +name = "p3-fri" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2529a174a04189cfe705d756fb0e33d3c8fb06b167b521ddb877c78407f12a" +dependencies = [ + "itertools 0.12.1", + "p3-challenger", + "p3-commit", + "p3-dft", + "p3-field", + "p3-interpolation", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "serde", + "tracing", +] + +[[package]] +name = "p3-interpolation" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6662049877c802155cdb4863db59899469fc3565d22d9047e1bd22d6b71f28e5" +dependencies = [ + "p3-field", + "p3-matrix", + "p3-util", +] + +[[package]] +name = "p3-keccak-air" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169c96f8f0aaa9042872fdb6bbae0477fd1363b87c23877dbb2ec7fb46f8fcfa" +dependencies = [ + "p3-air", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "tracing", +] + +[[package]] +name = "p3-koala-bear" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eb1f52bcb6be38bdc8fa6b38b3434d4eedd511f361d4249fd798c6a5ef817b40" +dependencies = [ + "num-bigint 0.4.6", + "p3-field", + "p3-mds", + "p3-poseidon2", + "p3-symmetric", + "rand 0.8.5", + "serde", +] + +[[package]] +name = "p3-matrix" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5583e9cd136a4095a25c41a9edfdcce2dfae58ef01639317813bdbbd5b55c583" +dependencies = [ + "itertools 0.12.1", + "p3-field", + "p3-maybe-rayon", + "p3-util", + "rand 0.8.5", + "serde", + "tracing", +] + +[[package]] +name = "p3-maybe-rayon" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e524d47a49fb4265611303339c4ef970d892817b006cc330dad18afb91e411b1" +dependencies = [ + "rayon", +] + +[[package]] +name = "p3-mds" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f6cb8edcb276033d43769a3725570c340d2ed6f35c3cca4cddeee07718fa376" +dependencies = [ + "itertools 0.12.1", + "p3-dft", + "p3-field", + "p3-matrix", + "p3-symmetric", + "p3-util", + "rand 0.8.5", +] + +[[package]] +name = "p3-merkle-tree" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e8bc3c224fc70d22f9556393e1482b52539e11c7b82ac6933c436fd82738f4" +dependencies = [ + "itertools 0.12.1", + "p3-commit", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-symmetric", + "p3-util", + "serde", + "tracing", +] + +[[package]] +name = "p3-poseidon2" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a26197df2097b98ab7038d59a01e1fe1a0f545e7e04aa9436b2454b1836654f" +dependencies = [ + "gcd", + "p3-field", + "p3-mds", + "p3-symmetric", + "rand 0.8.5", + "serde", +] + +[[package]] +name = "p3-symmetric" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3a1d3b5202096bca57cde912fbbb9cbaedaf5ac7c42a924c7166b98709d64d21" +dependencies = [ + "itertools 0.12.1", + "p3-field", + "serde", +] + +[[package]] +name = "p3-uni-stark" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fef1cdb8285a7adb78df991852d3b66d3b25cf6ffc34f528505d1aee49bdb968" +dependencies = [ + "itertools 0.12.1", + "p3-air", + "p3-challenger", + "p3-commit", + "p3-dft", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "serde", + "tracing", +] + +[[package]] +name = "p3-util" +version = "0.3.2-succinct" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec5f0388aa6d935ca3a17444086120f393f0b2f0816010b5ff95998c1c4095e3" +dependencies = [ + "serde", +] + +[[package]] +name = "pairing" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135590d8bdba2b31346f9cd1fb2a912329f5135e832a4f422942eb6ead8b6b3b" +dependencies = [ + "group 0.12.1", +] + +[[package]] +name = "parity-scale-codec" +version = "3.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799781ae679d79a948e13d4824a40970bfa500058d245760dd857301059810fa" +dependencies = [ + "arrayvec", + "byte-slice-cast", + "const_format", + "impl-trait-for-tuples", + "parity-scale-codec-derive", + "rustversion", +] + +[[package]] +name = "parity-scale-codec-derive" +version = "3.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34b4653168b563151153c9e4c08ebed57fb8262bebfa79711552fa983c623e7a" +dependencies = [ + "proc-macro-crate 3.4.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "pasta_curves" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc65faf8e7313b4b1fbaa9f7ca917a0eed499a9663be71477f87993604341d8" +dependencies = [ + "blake2b_simd", + "ff 0.12.1", + "group 0.12.1", + "lazy_static", + "rand 0.8.5", + "static_assertions", + "subtle", +] + +[[package]] +name = "pasta_curves" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e57598f73cc7e1b2ac63c79c517b31a0877cd7c402cdcaa311b5208de7a095" +dependencies = [ + "blake2b_simd", + "ff 0.13.1", + "group 0.13.0", + "lazy_static", + "rand 0.8.5", + "static_assertions", + "subtle", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap 2.13.0", +] + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.117", +] + +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve", +] + +[[package]] +name = "proc-macro-crate" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" +dependencies = [ + "once_cell", + "toml_edit 0.19.15", +] + +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +dependencies = [ + "toml_edit 0.23.10+spec-1.0.0", +] + +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + 
"prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +dependencies = [ + "heck", + "itertools 0.14.0", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.117", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.1", + "rustls", + "socket2 0.6.2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash 2.1.1", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" 
+dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2 0.6.2", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 
+dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "range-set-blaze" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2" +dependencies = [ + "gen_ops", + "itertools 0.12.1", + "num-integer", + "num-traits", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "rayon-scan" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f87cc11a0140b4b0da0ffc889885760c61b13672d80a908920b2c0df078fa14" +dependencies = [ + "rayon", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror 1.0.69", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower 0.5.3", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rlsf" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1646a59a9734b8b7a0ac51689388a60fe1625d4b956348e9de07591a1478457a" +dependencies = [ + "cfg-if", + "const-default", + "libc", + "rustversion", + "svgbobdoc", +] + +[[package]] +name = "rrs-succinct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efd079cd303257a4cb4e5aadfa79a7fe23f3c8301aa4740ccc3a99673485a352" +dependencies = [ + "downcast-rs", + "num_enum", + "paste", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc-hex" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e75f6a532d0fd9f7f13144f392b6ad56a32696bfcd9c78f797f16bbb6f072d6" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "scale-info" +version = "2.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346a3b32eba2640d17a9cb5927056b08f3de90f65b72fe09402c2ad07d684d0b" +dependencies = [ + "cfg-if", + "derive_more", + "parity-scale-codec", + "scale-info-derive", +] + +[[package]] +name = "scale-info-derive" +version = "2.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6630024bf739e2179b91fb424b28898baf819414262c5d376677dbff1fe7ebf" +dependencies = [ + "proc-macro-crate 3.4.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "scc" +version = 
"2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc" +dependencies = [ + "sdd", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sdd" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca" + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct", + "der", + "generic-array 0.14.9", + "pkcs8", + "serdect", + "subtle", + "zeroize", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +dependencies = [ + "serde", + "serde_core", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_arrays" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94a16b99c5ea4fe3daccd14853ad260ec00ea043b2708d1fd1da3106dcd8d9df" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serdect" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84f14a19e9a014bb9f4512488d9829a68e04ecabffb0f9904cd1ace94598177" +dependencies = [ + "base16ct", + "serde", +] + +[[package]] +name = "serial_test" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f" +dependencies = [ + "futures-executor", + "futures-util", + "log", + "once_cell", + "parking_lot", + "scc", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "sha1_smol" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", 
+] + +[[package]] +name = "sha3" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" +dependencies = [ + "digest", + "keccak", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "slop-air" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c27279ff5aa6177ad08fd2bcde31f34fc98ea633666a835a4fad3502824dce26" +dependencies = [ + "p3-air", +] + +[[package]] +name = "slop-algebra" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1d38320f4622a9f07907b8529d031066a75a6e741ea2ef17ed1e16047f5bd77" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "serde", +] + +[[package]] +name = "slop-alloc" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "51cdc27df6c9fe163f68b724d2b00b4edd24e66bf38a06e7bc473e50e36c3799" +dependencies = [ + "serde", + "slop-algebra", + "thiserror 1.0.69", +] + +[[package]] +name = "slop-baby-bear" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3500e0ad37b85d0dfd792c59615abe9741fe519c78bdc3928dc3fbbab57c2b5b" +dependencies = [ + "lazy_static", + "p3-baby-bear", + "serde", + "slop-algebra", + "slop-challenger", + "slop-poseidon2", + "slop-symmetric", +] + +[[package]] +name = "slop-basefold" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc3d75bc5651b46f135ac04140fefa4a9a4143440edcccc1e9d8e4d3dd05715" +dependencies = [ + "derive-where", + "itertools 0.14.0", + "serde", + "slop-algebra", + "slop-alloc", + "slop-baby-bear", + "slop-bn254", + "slop-challenger", + "slop-koala-bear", + "slop-merkle-tree", + "slop-multilinear", + "slop-primitives", + "slop-tensor", + "slop-utils", + "thiserror 1.0.69", +] + +[[package]] +name = "slop-basefold-prover" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17834564e6d40554b7a635db4fb8cfd61a19f7cc3438549d53b40a4e9e157b1f" +dependencies = [ + "derive-where", + "itertools 0.14.0", + "rand 0.8.5", + "serde", + "slop-algebra", + "slop-alloc", + "slop-baby-bear", + "slop-basefold", + "slop-bn254", + "slop-challenger", + "slop-commit", + "slop-dft", + "slop-fri", + "slop-futures", + "slop-koala-bear", + "slop-merkle-tree", + "slop-multilinear", + "slop-tensor", + "thiserror 1.0.69", +] + +[[package]] +name = "slop-bn254" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91cb09414adf73264281cf490e2bd23be7d28415e4e729a275029ebc1a0acf6a" +dependencies = [ + "ff 0.13.1", + "p3-bn254-fr", + "serde", + "slop-algebra", + "slop-challenger", + "slop-poseidon2", + "slop-symmetric", + "zkhash", +] + +[[package]] +name = "slop-challenger" +version = "6.0.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "395ae2cad21ea894c614166f48dce58135be2aa13ab04971cbe6e31b85ad9902" +dependencies = [ + "futures", + "p3-challenger", + "serde", + "slop-algebra", + "slop-symmetric", +] + +[[package]] +name = "slop-commit" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b0ed38f216999ad9211f384f59d20ff0f70b88010a2856b7e0dde4d23b8cde8" +dependencies = [ + "p3-commit", + "serde", + "slop-alloc", +] + +[[package]] +name = "slop-dft" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9211d3c0ff3794563ffc7c3ffa3a5cde8becee5f6e831fb94552a607e320fd23" +dependencies = [ + "p3-dft", + "serde", + "slop-algebra", + "slop-alloc", + "slop-matrix", + "slop-tensor", +] + +[[package]] +name = "slop-fri" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c90e0689aa9b4f67700d6a100fd02e6e0f17de1eb806bb78e2074462b7b6201" +dependencies = [ + "p3-fri", +] + +[[package]] +name = "slop-futures" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e68c32cc3be82a37b69af3d1e4effb509839d9c2fab7457c41c3d50dd32a842e" +dependencies = [ + "crossbeam", + "futures", + "pin-project", + "rayon", + "thiserror 1.0.69", + "tokio", + "tracing", +] + +[[package]] +name = "slop-jagged" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bce3113032254921bef5e071216f35519ea08730a1f44f2ade4be7e0c305631a" +dependencies = [ + "derive-where", + "futures", + "itertools 0.14.0", + "num_cpus", + "rand 0.8.5", + "rayon", + "serde", + "slop-algebra", + "slop-alloc", + "slop-baby-bear", + "slop-basefold", + "slop-basefold-prover", + "slop-bn254", + "slop-challenger", + "slop-commit", + "slop-futures", + "slop-koala-bear", + "slop-merkle-tree", + "slop-multilinear", + "slop-stacked", + "slop-sumcheck", + "slop-symmetric", + 
"slop-tensor", + "slop-utils", + "thiserror 1.0.69", + "tracing", +] + +[[package]] +name = "slop-keccak-air" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bef24890c8a39c8caf484afa97060c41466455f9102283902ff68bbfd7f841" +dependencies = [ + "p3-keccak-air", +] + +[[package]] +name = "slop-koala-bear" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb1f80eb2a075f550c7e9abed16e03c727f54108f587a465d023ec810100a70f" +dependencies = [ + "lazy_static", + "p3-koala-bear", + "serde", + "slop-algebra", + "slop-challenger", + "slop-poseidon2", + "slop-symmetric", +] + +[[package]] +name = "slop-matrix" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba089c19d768cc452b511f754958254892caed33d7a8d744ffc67377111e4908" +dependencies = [ + "p3-matrix", +] + +[[package]] +name = "slop-maybe-rayon" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e968db301ffe72ca69fe7a8b61e0f5f8d3b22a12475c5c9b99141e60ad8956d" +dependencies = [ + "p3-maybe-rayon", +] + +[[package]] +name = "slop-merkle-tree" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f2721f11f0242bcc36e3cdc70cf31fdbb936b2731f1d059929b436fe002fa8" +dependencies = [ + "derive-where", + "ff 0.13.1", + "itertools 0.14.0", + "p3-merkle-tree", + "serde", + "slop-algebra", + "slop-alloc", + "slop-baby-bear", + "slop-bn254", + "slop-challenger", + "slop-commit", + "slop-futures", + "slop-koala-bear", + "slop-matrix", + "slop-poseidon2", + "slop-symmetric", + "slop-tensor", + "thiserror 1.0.69", + "zkhash", +] + +[[package]] +name = "slop-multilinear" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf5d26d5fc7751af8de644225f51c178e2af42e2b762496ba9a00fc65677617d" +dependencies = [ + "derive-where", + "futures", + "num_cpus", + "rand 
0.8.5", + "rayon", + "serde", + "slop-algebra", + "slop-alloc", + "slop-challenger", + "slop-commit", + "slop-futures", + "slop-matrix", + "slop-tensor", +] + +[[package]] +name = "slop-poseidon2" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f26080f555f777867a68eb18fa34d7c321e9f0250ace86ef3f0cb0151157133" +dependencies = [ + "p3-poseidon2", +] + +[[package]] +name = "slop-primitives" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a606113e4aac9024483e283ab6ef7afc4ebd5d5ca0915b713f8d1d23aa1687bd" +dependencies = [ + "slop-algebra", +] + +[[package]] +name = "slop-stacked" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f807203f2d5505ab4b6f44a99d575aaf0d46a39b16af42397b310063667ee8" +dependencies = [ + "derive-where", + "futures", + "itertools 0.14.0", + "serde", + "slop-algebra", + "slop-alloc", + "slop-basefold", + "slop-basefold-prover", + "slop-challenger", + "slop-commit", + "slop-futures", + "slop-merkle-tree", + "slop-multilinear", + "slop-tensor", + "thiserror 1.0.69", +] + +[[package]] +name = "slop-sumcheck" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4045fc34ee3aef98a67baf650bc462327bbc95ca69e0265ba56f9b6cfc2515b" +dependencies = [ + "futures", + "itertools 0.14.0", + "rayon", + "serde", + "slop-algebra", + "slop-alloc", + "slop-baby-bear", + "slop-challenger", + "slop-multilinear", + "thiserror 1.0.69", +] + +[[package]] +name = "slop-symmetric" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1eb38a05aacd00d2362bb5f51c00f3e9cb82b7091d7b862ac239171d5a3dcad4" +dependencies = [ + "p3-symmetric", +] + +[[package]] +name = "slop-tensor" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba4b24bc6985c0215c459e723228c0da10a7b35541c8ccb1b533146d49df49f" 
+dependencies = [ + "arrayvec", + "derive-where", + "itertools 0.14.0", + "rand 0.8.5", + "rayon", + "serde", + "slop-algebra", + "slop-alloc", + "slop-futures", + "slop-matrix", + "thiserror 1.0.69", + "transpose", +] + +[[package]] +name = "slop-uni-stark" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f7e27e2c06b9504dbb5eb3cbee929b651f9da6ea5112dd4004ce1cc3b8e586" +dependencies = [ + "p3-uni-stark", +] + +[[package]] +name = "slop-utils" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b2e9bd1717e7848d44ce8f5d3eb92209c658a0e934b02af7a5dad4f70271a6" +dependencies = [ + "p3-util", + "tracing-forest", + "tracing-subscriber", +] + +[[package]] +name = "slop-whir" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1393446116ca30b7685a5ca9bb50bb9095b8074bdd63941a8d64b3c22cc14a8" +dependencies = [ + "derive-where", + "futures", + "itertools 0.14.0", + "rand 0.8.5", + "rayon", + "serde", + "slop-algebra", + "slop-alloc", + "slop-baby-bear", + "slop-basefold", + "slop-challenger", + "slop-commit", + "slop-dft", + "slop-jagged", + "slop-koala-bear", + "slop-matrix", + "slop-merkle-tree", + "slop-multilinear", + "slop-stacked", + "slop-tensor", + "slop-utils", + "thiserror 1.0.69", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +dependencies = [ + "serde", +] + +[[package]] +name = "snowbridge-amcl" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460a9ed63cdf03c1b9847e8a12a5f5ba19c4efd5869e4a737e05be25d7c427e5" +dependencies = [ + "parity-scale-codec", + "scale-info", +] + +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "sp1-build" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c469c584f9a1f0f7a64283c94c074d1edb0446a2ff76a7f60f7e4ce804f4b2c" +dependencies = [ + "anyhow", + "cargo_metadata", + "chrono", + "clap", + "dirs", + "sp1-primitives", +] + +[[package]] +name = "sp1-core-executor" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c3d6a58470da2280a8bf14457721b1f560e4d0b8e67c1067e1ce78fdcc5fde3" +dependencies = [ + "bincode", + "bytemuck", + "cfg-if", + "clap", + "deepsize2", + "elf", + "enum-map", + "eyre", + "hashbrown 0.14.5", + "hex", + "itertools 0.14.0", + "memmap2", + "num", + "rrs-succinct", + "serde", + "serde_arrays", + "serde_json", + "slop-air", + "slop-algebra", + "slop-maybe-rayon", + "slop-symmetric", + "sp1-curves", + "sp1-hypercube", + "sp1-jit", + "sp1-primitives", + "strum", + "subenum", + "thiserror 1.0.69", + "tiny-keccak", + "tracing", + "typenum", + "vec_map", +] + +[[package]] +name = "sp1-core-machine" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7704a5542a77a0b98e483bd87256362658a91b7b70a6864c5c2c92fbbc7a5a71" +dependencies = [ + "bincode", + "cfg-if", + "enum-map", + "futures", + "generic-array 1.1.0", + "hashbrown 0.14.5", + "itertools 0.14.0", + "num", + "num_cpus", + "rayon", + "rayon-scan", + "rrs-succinct", + "serde", + "serde_json", + "slop-air", + "slop-algebra", + "slop-challenger", + "slop-futures", + "slop-keccak-air", + "slop-matrix", + "slop-maybe-rayon", + "slop-uni-stark", + "snowbridge-amcl", + 
"sp1-core-executor", + "sp1-curves", + "sp1-derive", + "sp1-hypercube", + "sp1-jit", + "sp1-primitives", + "static_assertions", + "strum", + "sysinfo", + "tempfile", + "thiserror 1.0.69", + "tokio", + "tracing", + "tracing-forest", + "tracing-subscriber", + "typenum", +] + +[[package]] +name = "sp1-cuda" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03e57b85361e6fcc7d5405867eb036c10969518fb8173e3d6744574f97954766" +dependencies = [ + "bincode", + "bytes", + "reqwest", + "serde", + "serde_json", + "sp1-core-executor", + "sp1-core-machine", + "sp1-primitives", + "sp1-prover", + "sp1-prover-types", + "thiserror 1.0.69", + "tokio", + "tracing", +] + +[[package]] +name = "sp1-curves" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eabe28a711559675f1addb4a529159c5db383a79f62819dffd4349ccb4e979e" +dependencies = [ + "cfg-if", + "dashu", + "elliptic-curve", + "generic-array 1.1.0", + "itertools 0.14.0", + "k256", + "num", + "p256", + "serde", + "slop-algebra", + "snowbridge-amcl", + "sp1-primitives", + "typenum", +] + +[[package]] +name = "sp1-derive" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09bb8b5d4eade7611018a28063f32a73f5c59bc1b29a8e517413dca66084ca0f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "sp1-hypercube" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03ac19804d8b1bf955fb2fd7722c9b12bcbe9343a0e4b88ecf607a3e85ae63cd" +dependencies = [ + "arrayref", + "deepsize2", + "derive-where", + "futures", + "hashbrown 0.14.5", + "itertools 0.14.0", + "num-bigint 0.4.6", + "num-traits", + "num_cpus", + "rayon", + "rayon-scan", + "serde", + "slop-air", + "slop-algebra", + "slop-alloc", + "slop-basefold", + "slop-basefold-prover", + "slop-bn254", + "slop-challenger", + "slop-commit", + "slop-futures", + "slop-jagged", + 
"slop-koala-bear", + "slop-matrix", + "slop-merkle-tree", + "slop-multilinear", + "slop-poseidon2", + "slop-stacked", + "slop-sumcheck", + "slop-symmetric", + "slop-tensor", + "slop-uni-stark", + "slop-whir", + "sp1-derive", + "sp1-primitives", + "strum", + "thiserror 1.0.69", + "thousands", + "tokio", + "tracing", +] + +[[package]] +name = "sp1-jit" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb1eff715595ef7059f2db3845f941bbaf5c2635e2b6b0fe0b0d982d4422f6a" +dependencies = [ + "dynasmrt", + "hashbrown 0.14.5", + "memfd", + "memmap2", + "serde", + "tracing", + "uuid", +] + +[[package]] +name = "sp1-lib" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c49bc98323d52ec8bef7ae7db15fa095182edfdc2e7d9123f0c57173014e48" +dependencies = [ + "bincode", + "serde", + "sp1-primitives", +] + +[[package]] +name = "sp1-primitives" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04953c36911214897091107e2a3443fcf531892b0883ce57d4a2eea65d28c72b" +dependencies = [ + "bincode", + "blake3", + "elf", + "hex", + "itertools 0.14.0", + "lazy_static", + "num-bigint 0.4.6", + "serde", + "sha2", + "slop-algebra", + "slop-bn254", + "slop-challenger", + "slop-koala-bear", + "slop-poseidon2", + "slop-primitives", + "slop-symmetric", +] + +[[package]] +name = "sp1-prover" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a47f86dbe432038fed00fd869b0937d11b87abdb0e34a676aaeb5a723f1e31e3" +dependencies = [ + "anyhow", + "bincode", + "clap", + "dirs", + "downloader", + "either", + "enum-map", + "eyre", + "futures", + "hashbrown 0.14.5", + "hex", + "indicatif", + "itertools 0.14.0", + "lru", + "mti", + "num-bigint 0.4.6", + "opentelemetry", + "pin-project", + "rand 0.8.5", + "reqwest", + "serde", + "serde_json", + "serial_test", + "sha2", + "slop-air", + "slop-algebra", + "slop-basefold", + 
"slop-bn254", + "slop-challenger", + "slop-futures", + "slop-jagged", + "slop-multilinear", + "slop-stacked", + "slop-symmetric", + "sp1-core-executor", + "sp1-core-machine", + "sp1-derive", + "sp1-hypercube", + "sp1-jit", + "sp1-primitives", + "sp1-prover-types", + "sp1-recursion-circuit", + "sp1-recursion-compiler", + "sp1-recursion-executor", + "sp1-recursion-gnark-ffi", + "sp1-recursion-machine", + "sp1-verifier", + "static_assertions", + "sysinfo", + "tempfile", + "thiserror 1.0.69", + "tokio", + "tonic", + "tracing", + "tracing-appender", + "tracing-subscriber", +] + +[[package]] +name = "sp1-prover-types" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9de603ae06be908cca9c4e4117b5e3aceaf4e1b6b9a98b3892ec15a4a865c80f" +dependencies = [ + "anyhow", + "async-scoped", + "bincode", + "chrono", + "futures-util", + "hashbrown 0.14.5", + "mti", + "prost", + "serde", + "tokio", + "tonic", + "tonic-build", + "tracing", +] + +[[package]] +name = "sp1-recursion-circuit" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebf27cc0c97c8280ac5fd475112cf48cd31503da0d56dc902ed46a7d95232b28" +dependencies = [ + "bincode", + "itertools 0.14.0", + "rand 0.8.5", + "rayon", + "serde", + "slop-air", + "slop-algebra", + "slop-alloc", + "slop-basefold", + "slop-basefold-prover", + "slop-bn254", + "slop-challenger", + "slop-commit", + "slop-jagged", + "slop-koala-bear", + "slop-matrix", + "slop-merkle-tree", + "slop-multilinear", + "slop-stacked", + "slop-sumcheck", + "slop-symmetric", + "slop-tensor", + "slop-whir", + "sp1-core-executor", + "sp1-core-machine", + "sp1-derive", + "sp1-hypercube", + "sp1-primitives", + "sp1-recursion-compiler", + "sp1-recursion-executor", + "sp1-recursion-machine", + "tracing", +] + +[[package]] +name = "sp1-recursion-compiler" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"80c70432e7cc894a893a07d49d65149d99ad7deacb89502ec20f135c2f36ab7a" +dependencies = [ + "backtrace", + "cfg-if", + "itertools 0.14.0", + "serde", + "slop-algebra", + "slop-bn254", + "slop-symmetric", + "sp1-core-machine", + "sp1-hypercube", + "sp1-primitives", + "sp1-recursion-executor", + "tracing", + "vec_map", +] + +[[package]] +name = "sp1-recursion-executor" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07d09ed74240eddaaad86945602eda7c35ea90f969de9d7fd4faa9442ab7878c" +dependencies = [ + "backtrace", + "cfg-if", + "hashbrown 0.14.5", + "itertools 0.14.0", + "range-set-blaze", + "serde", + "slop-algebra", + "slop-maybe-rayon", + "slop-poseidon2", + "slop-symmetric", + "smallvec", + "sp1-derive", + "sp1-hypercube", + "static_assertions", + "thiserror 1.0.69", + "tracing", +] + +[[package]] +name = "sp1-recursion-gnark-ffi" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7b67bd9a9dcd038e68fa4a3272a78e2d3098df05250bb78857aa17fef411563" +dependencies = [ + "anyhow", + "bincode", + "bindgen", + "cfg-if", + "hex", + "num-bigint 0.4.6", + "serde", + "serde_json", + "sha2", + "slop-algebra", + "slop-symmetric", + "sp1-hypercube", + "sp1-primitives", + "sp1-recursion-compiler", + "sp1-verifier", + "tempfile", + "tracing", +] + +[[package]] +name = "sp1-recursion-machine" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4903ee3895f7cbffe8f5624e516debd2ef1a4db5b01f75ca1fe3e84b12f5a6" +dependencies = [ + "itertools 0.14.0", + "rand 0.8.5", + "slop-air", + "slop-algebra", + "slop-basefold", + "slop-matrix", + "slop-maybe-rayon", + "slop-symmetric", + "sp1-derive", + "sp1-hypercube", + "sp1-primitives", + "sp1-recursion-executor", + "strum", + "tracing", + "zkhash", +] + +[[package]] +name = "sp1-sdk" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"378f702c65ac9bea522fdde527f0260a95af155aa10d089e1e9e6ba660b60f50" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "cfg-if", + "dirs", + "eventsource-stream", + "futures", + "hex", + "indicatif", + "itertools 0.14.0", + "k256", + "num-bigint 0.4.6", + "serde", + "sha2", + "slop-algebra", + "slop-alloc", + "slop-basefold", + "slop-commit", + "slop-jagged", + "slop-merkle-tree", + "slop-multilinear", + "slop-stacked", + "slop-sumcheck", + "slop-tensor", + "sp1-build", + "sp1-core-executor", + "sp1-core-machine", + "sp1-cuda", + "sp1-hypercube", + "sp1-primitives", + "sp1-prover", + "sp1-prover-types", + "sp1-recursion-executor", + "sp1-recursion-gnark-ffi", + "sp1-verifier", + "strum", + "tempfile", + "thiserror 1.0.69", + "tokio", + "tracing", +] + +[[package]] +name = "sp1-verifier" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1942e85d450056725480ac900711869fe1ae453a4e069bcabff3ee7791773e62" +dependencies = [ + "bincode", + "blake3", + "cfg-if", + "dirs", + "hex", + "lazy_static", + "serde", + "sha2", + "slop-algebra", + "slop-challenger", + "slop-primitives", + "slop-symmetric", + "sp1-hypercube", + "sp1-primitives", + "sp1-recursion-executor", + "sp1-recursion-machine", + "strum", + "substrate-bn-succinct-rs", + "thiserror 2.0.18", +] + +[[package]] +name = "sp1-zkvm" +version = "6.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdf86a2a275e6788a1b34d71bc607fa5d5452d0149a15d34f7945f005ad6e37" +dependencies = [ + "cfg-if", + "critical-section", + "embedded-alloc", + "getrandom 0.2.17", + "getrandom 0.3.4", + "lazy_static", + "libm", + "rand 0.8.5", + "sha2", + "sp1-lib", + "sp1-primitives", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "spki" +version = "0.7.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "subenum" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3d08fe7078c57309d5c3d938e50eba95ba1d33b9c3a101a8465fc6861a5416" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "substrate-bn-succinct-rs" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a241fd7c1016fb8ad30fcf5a20986c0c4538e8f15a1b41a1761516299e377ec1" 
+dependencies = [ + "bytemuck", + "byteorder", + "cfg-if", + "crunchy", + "lazy_static", + "num-bigint 0.4.6", + "rand 0.8.5", + "rustc-hex", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "svgbobdoc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c04b93fc15d79b39c63218f15e3fdffaa4c227830686e3b7c5f41244eb3e50" +dependencies = [ + "base64 0.13.1", + "proc-macro2", + "quote", + "syn 1.0.109", + "unicode-width 0.1.14", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "sysinfo" +version = "0.30.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + 
"windows", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "tempfile" +version = "3.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" +dependencies = [ + "fastrand", + "getrandom 0.4.1", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thousands" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] 
+name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.49.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "signal-hook-registry", + "socket2 0.6.2", + 
"tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" + +[[package]] +name = "toml_datetime" +version = "0.7.5+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_edit" +version = "0.19.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" +dependencies = [ + "indexmap 2.13.0", + "toml_datetime 0.6.11", + "winnow 0.5.40", +] + +[[package]] +name = "toml_edit" +version = "0.23.10+spec-1.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +dependencies = [ + "indexmap 2.13.0", + "toml_datetime 0.7.5+spec-1.1.0", + "toml_parser", + "winnow 0.7.14", +] + +[[package]] +name = "toml_parser" +version = "1.0.9+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +dependencies = [ + "winnow 0.7.14", +] + +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.22.1", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost", + "rustls-pemfile", + "socket2 0.5.10", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "prost-types", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand 0.8.5", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "iri-string", + "pin-project-lite", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-appender" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "786d480bce6247ab75f005b14ae1624ad978d3029d9113f0a22fa1ac773faeaf" +dependencies = [ + "crossbeam-channel", + "thiserror 2.0.18", + "time", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-forest" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee40835db14ddd1e3ba414292272eddde9dad04d3d4b65509656414d1c42592f" +dependencies = [ + "ansi_term", + "smallvec", + "thiserror 1.0.69", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typeid_prefix" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9da1387307fdee46aa441e4f08a1b491e659fcac1aca9cd71f2c624a0de5d1b" + +[[package]] +name = "typeid_suffix" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77b55e96f110c6db5d1a2f24072552537f0091dc90cebeaa679540bac93e7405" +dependencies = [ + "uuid", +] + +[[package]] 
+name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +dependencies = [ + "atomic", + "getrandom 0.4.1", + "js-sys", + "md-5", + "sha1_smol", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +dependencies = [ + "serde", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.113" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2" +dependencies = [ + "cfg-if", + 
"once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.113" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.113" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn 2.0.117", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.113" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.0", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" 
+dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core 0.52.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" 
+version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + 
"windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = 
"0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] + +[[package]] +name = "winnow" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +dependencies = [ + 
"memchr", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.0", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + 
"wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + 
"syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zkhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4352d1081da6922701401cdd4cbf29a2723feb4cfabb5771f6fee8e9276da1c7" +dependencies = [ + "ark-ff", + "ark-std", + "bitvec", + "blake2", + "bls12_381", + "byteorder", + "cfg-if", + "group 0.12.1", + "group 0.13.0", + "halo2", + "hex", + "jubjub", + "lazy_static", + "pasta_curves 0.5.1", + "rand 0.8.5", + "serde", + "sha2", + "sha3", + "subtle", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/bench_vs/sp1/fibonacci/Cargo.toml b/bench_vs/sp1/fibonacci/Cargo.toml new file mode 100644 index 000000000..fc24039c2 --- /dev/null +++ b/bench_vs/sp1/fibonacci/Cargo.toml @@ -0,0 +1,3 @@ +[workspace] +members = ["program", "script"] +resolver = "2" diff --git a/bench_vs/sp1/fibonacci/program/Cargo.toml b/bench_vs/sp1/fibonacci/program/Cargo.toml new file mode 100644 index 000000000..551be48b5 --- /dev/null +++ b/bench_vs/sp1/fibonacci/program/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "fibonacci-program" +version = "0.1.0" +edition = "2021" + +[dependencies] +sp1-zkvm = "6.0.1" diff --git a/bench_vs/sp1/fibonacci/program/src/main.rs b/bench_vs/sp1/fibonacci/program/src/main.rs new file mode 100644 index 000000000..571e157b3 --- /dev/null +++ b/bench_vs/sp1/fibonacci/program/src/main.rs @@ -0,0 +1,14 @@ +#![no_main] +sp1_zkvm::entrypoint!(main); + +pub fn main() { + let n: u64 = sp1_zkvm::io::read::<u64>(); + let mut a: u64 = 0; + let mut b: u64 = 1; + for _ in 0..n { + let c = a.wrapping_add(b); + a = b; + b = c; + } + sp1_zkvm::io::commit(&b); +} diff --git a/bench_vs/sp1/fibonacci/rust-toolchain b/bench_vs/sp1/fibonacci/rust-toolchain new file mode 100644 index 000000000..9397b9526 --- /dev/null +++ b/bench_vs/sp1/fibonacci/rust-toolchain @@ -0,0 +1,3 @@ +[toolchain] +channel = "stable" +components = ["llvm-tools", "rustc-dev"] diff --git a/bench_vs/sp1/fibonacci/script/Cargo.toml b/bench_vs/sp1/fibonacci/script/Cargo.toml new file mode 100644 index 000000000..b72b33517 --- /dev/null +++ b/bench_vs/sp1/fibonacci/script/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "fibonacci-script" +version = "0.1.0" +edition = "2021" + +[dependencies] +sp1-sdk = { version = "6.0.1", features = ["blocking"] } + +[build-dependencies] +sp1-build = "6.0.1" diff --git a/bench_vs/sp1/fibonacci/script/build.rs b/bench_vs/sp1/fibonacci/script/build.rs new file mode 100644 index 000000000..d6cf925d6 ---
/dev/null +++ b/bench_vs/sp1/fibonacci/script/build.rs @@ -0,0 +1,5 @@ +use sp1_build::build_program_with_args; + +fn main() { + build_program_with_args("../program", Default::default()); +} diff --git a/bench_vs/sp1/fibonacci/script/src/main.rs b/bench_vs/sp1/fibonacci/script/src/main.rs new file mode 100644 index 000000000..761d0c911 --- /dev/null +++ b/bench_vs/sp1/fibonacci/script/src/main.rs @@ -0,0 +1,47 @@ +use sp1_sdk::blocking::{ProveRequest, Prover, ProverClient}; +use sp1_sdk::{include_elf, ProvingKey, SP1Stdin}; +use std::time::Instant; + +const FIB_ELF: sp1_sdk::Elf = include_elf!("fibonacci-program"); + +fn main() { + sp1_sdk::utils::setup_logger(); + + let n: u64 = std::env::args() + .nth(1) + .expect("Usage: fibonacci-script ") + .parse() + .expect("n must be a u64"); + + let client = ProverClient::from_env(); + let mut stdin = SP1Stdin::new(); + stdin.write(&n); + + // Setup + let pk = client.setup(FIB_ELF.clone()).expect("setup failed"); + + // Execute for cycle count + let (_, report) = client + .execute(FIB_ELF.clone(), stdin.clone()) + .run() + .unwrap(); + println!("Cycles: {}", report.total_instruction_count()); + + // Core proof (no recursion) + let start = Instant::now(); + let proof = client + .prove(&pk, stdin) + .core() + .run() + .expect("prove failed"); + let elapsed = start.elapsed(); + + println!("Proving time: {:.3}s", elapsed.as_secs_f64()); + + // Verify + client + .verify(&proof, pk.verifying_key(), None) + .expect("verify failed"); + + println!("Proof verified successfully"); +} From 472ddc394a5a9cd5346fd435bdb69add8574f4c6 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 25 Feb 2026 19:22:16 -0300 Subject: [PATCH 02/34] Standarize guest --- bench_vs/README.md | 4 ++-- bench_vs/run.sh | 36 ++++++++++-------------------------- 2 files changed, 12 insertions(+), 28 deletions(-) diff --git a/bench_vs/README.md b/bench_vs/README.md index 0a20304c3..1be30c5d2 100644 --- a/bench_vs/README.md +++ b/bench_vs/README.md @@ -15,9 +15,9 
@@ Compares proving time for an identical u64 wrapping Fibonacci computation. sp1up ``` -3. **RISC-V assembler** — Homebrew clang + ld.lld (macOS): +3. **Rust nightly** (for cross-compiling Lambda VM guest): ```bash - brew install llvm + rustup toolchain install nightly ``` ## Usage diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 1575e62a3..95b84d4d6 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -9,8 +9,7 @@ # Prerequisites: # - Lambda VM CLI built: cargo build --release -p cli # - SP1 toolchain installed: curl -L https://sp1up.succinct.xyz | bash && sp1up -# - clang with RISC-V target support (macOS Homebrew clang works) -# - ld.lld linker +# - Rust nightly toolchain: rustup toolchain install nightly set -euo pipefail @@ -65,6 +64,9 @@ rm -rf "$TMP_DIR" && mkdir -p "$TMP_DIR" # --- Pre-build --------------------------------------------------------------- CLI="$ROOT_DIR/target/release/cli" +LAMBDA_DIR="$SCRIPT_DIR/lambda/fibonacci" +TARGET_SPEC="$ROOT_DIR/executor/programs/riscv64im-lambda-vm-elf.json" + if $RUN_LAMBDA && [ ! 
-f "$CLI" ]; then echo -e "${YELLOW}[Lambda VM] CLI not found, building...${NC}" cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1 @@ -97,32 +99,14 @@ run_one() { local sp1_cycles="" if $RUN_LAMBDA; then - # Generate assembly - cat > "$TMP_DIR/fib.s" <&1 | tail -1) + LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench" echo -e " ${GREEN}[Lambda VM] Proving...${NC}" - LAMBDA_OUTPUT=$("$CLI" prove "$TMP_DIR/fib.elf" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null) + LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null) lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" fi From 88171d01081e8d9ebde34497456d9ddcb4bc5b40 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 25 Feb 2026 19:35:36 -0300 Subject: [PATCH 03/34] Fix decimals --- bench_vs/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 95b84d4d6..ed0b519df 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -156,7 +156,7 @@ for i in "${!RESULT_N[@]}"; do if $RUN_LAMBDA && $RUN_SP1; then if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then - RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $st / $lt}") + RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.2fx\", $st / $lt}") if (( $(LC_NUMERIC=C awk "BEGIN {print ($st > $lt)}") )); then RATIO="${GREEN}${RATIO}${NC}" else From ef7d5bb6a62990733dd2959689803f64efcd1762 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 25 Feb 2026 19:36:24 -0300 Subject: [PATCH 04/34] Benchmark vs sp1 --- bench_vs/run.sh | 8 +-- bench_vs/sp1/fibonacci/.tldr/daemon.pid | 1 - bench_vs/sp1/fibonacci/.tldr/status | 1 - bench_vs/sp1/fibonacci/.tldrignore | 84 ------------------------- 4 files changed, 4 insertions(+), 90 deletions(-) delete mode 100644 bench_vs/sp1/fibonacci/.tldr/daemon.pid delete mode 100644 
bench_vs/sp1/fibonacci/.tldr/status delete mode 100644 bench_vs/sp1/fibonacci/.tldrignore diff --git a/bench_vs/run.sh b/bench_vs/run.sh index ed0b519df..113666124 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -156,11 +156,11 @@ for i in "${!RESULT_N[@]}"; do if $RUN_LAMBDA && $RUN_SP1; then if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then - RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.2fx\", $st / $lt}") - if (( $(LC_NUMERIC=C awk "BEGIN {print ($st > $lt)}") )); then - RATIO="${GREEN}${RATIO}${NC}" - else + RATIO=$(LC_NUMERIC=C awk "BEGIN {printf \"%.1fx\", $lt / $st}") + if (( $(LC_NUMERIC=C awk "BEGIN {print ($lt > $st)}") )); then RATIO="${RED}${RATIO}${NC}" + else + RATIO="${GREEN}${RATIO}${NC}" fi printf " %-10s %11ss %11ss " "$n" "$lt" "$st" echo -e "$RATIO" diff --git a/bench_vs/sp1/fibonacci/.tldr/daemon.pid b/bench_vs/sp1/fibonacci/.tldr/daemon.pid deleted file mode 100644 index 10eda36c4..000000000 --- a/bench_vs/sp1/fibonacci/.tldr/daemon.pid +++ /dev/null @@ -1 +0,0 @@ -39495 \ No newline at end of file diff --git a/bench_vs/sp1/fibonacci/.tldr/status b/bench_vs/sp1/fibonacci/.tldr/status deleted file mode 100644 index ad50b5340..000000000 --- a/bench_vs/sp1/fibonacci/.tldr/status +++ /dev/null @@ -1 +0,0 @@ -ready \ No newline at end of file diff --git a/bench_vs/sp1/fibonacci/.tldrignore b/bench_vs/sp1/fibonacci/.tldrignore deleted file mode 100644 index e01df83cb..000000000 --- a/bench_vs/sp1/fibonacci/.tldrignore +++ /dev/null @@ -1,84 +0,0 @@ -# TLDR ignore patterns (gitignore syntax) -# Auto-generated - review and customize for your project -# Docs: https://git-scm.com/docs/gitignore - -# =================== -# Dependencies -# =================== -node_modules/ -.venv/ -venv/ -env/ -__pycache__/ -.tox/ -.nox/ -.pytest_cache/ -.mypy_cache/ -.ruff_cache/ -vendor/ -Pods/ - -# =================== -# Build outputs -# =================== -dist/ -build/ -out/ -target/ -*.egg-info/ -*.whl -*.pyc -*.pyo - -# =================== -# Binary/large 
files -# =================== -*.so -*.dylib -*.dll -*.exe -*.bin -*.o -*.a -*.lib - -# =================== -# IDE/editors -# =================== -.idea/ -.vscode/ -*.swp -*.swo -*~ - -# =================== -# Security (always exclude) -# =================== -.env -.env.* -*.pem -*.key -*.p12 -*.pfx -credentials.* -secrets.* - -# =================== -# Version control -# =================== -.git/ -.hg/ -.svn/ - -# =================== -# OS files -# =================== -.DS_Store -Thumbs.db - -# =================== -# Project-specific -# Add your custom patterns below -# =================== -# large_test_fixtures/ -# data/ From e11346a084bcfe9781a184679420a3161a9ab39e Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 25 Feb 2026 20:19:15 -0300 Subject: [PATCH 05/34] Add lambda program --- bench_vs/lambda/fibonacci/.cargo/config.toml | 6 ++++ bench_vs/lambda/fibonacci/Cargo.lock | 7 ++++ bench_vs/lambda/fibonacci/Cargo.toml | 8 +++++ bench_vs/lambda/fibonacci/build.rs | 10 ++++++ bench_vs/lambda/fibonacci/src/main.rs | 35 ++++++++++++++++++++ 5 files changed, 66 insertions(+) create mode 100644 bench_vs/lambda/fibonacci/.cargo/config.toml create mode 100644 bench_vs/lambda/fibonacci/Cargo.lock create mode 100644 bench_vs/lambda/fibonacci/Cargo.toml create mode 100644 bench_vs/lambda/fibonacci/build.rs create mode 100644 bench_vs/lambda/fibonacci/src/main.rs diff --git a/bench_vs/lambda/fibonacci/.cargo/config.toml b/bench_vs/lambda/fibonacci/.cargo/config.toml new file mode 100644 index 000000000..be730c3ec --- /dev/null +++ b/bench_vs/lambda/fibonacci/.cargo/config.toml @@ -0,0 +1,6 @@ +[target.riscv64im-lambda-vm-elf] +rustflags = [ + "-C", "link-arg=-e", + "-C", "link-arg=main", + "-C", "passes=lower-atomic" +] diff --git a/bench_vs/lambda/fibonacci/Cargo.lock b/bench_vs/lambda/fibonacci/Cargo.lock new file mode 100644 index 000000000..3a4bb7634 --- /dev/null +++ b/bench_vs/lambda/fibonacci/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by 
Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "fibonacci-bench" +version = "0.1.0" diff --git a/bench_vs/lambda/fibonacci/Cargo.toml b/bench_vs/lambda/fibonacci/Cargo.toml new file mode 100644 index 000000000..8ce06fec5 --- /dev/null +++ b/bench_vs/lambda/fibonacci/Cargo.toml @@ -0,0 +1,8 @@ +[workspace] + +[package] +name = "fibonacci-bench" +version = "0.1.0" +edition = "2024" + +[dependencies] diff --git a/bench_vs/lambda/fibonacci/build.rs b/bench_vs/lambda/fibonacci/build.rs new file mode 100644 index 000000000..5c189eadb --- /dev/null +++ b/bench_vs/lambda/fibonacci/build.rs @@ -0,0 +1,10 @@ +use std::env; +use std::fs; +use std::path::Path; + +fn main() { + let n = env::var("BENCH_N").unwrap_or_else(|_| "1000".to_string()); + let out_dir = env::var("OUT_DIR").unwrap(); + fs::write(Path::new(&out_dir).join("n.txt"), &n).unwrap(); + println!("cargo:rerun-if-env-changed=BENCH_N"); +} diff --git a/bench_vs/lambda/fibonacci/src/main.rs b/bench_vs/lambda/fibonacci/src/main.rs new file mode 100644 index 000000000..8f54cf604 --- /dev/null +++ b/bench_vs/lambda/fibonacci/src/main.rs @@ -0,0 +1,35 @@ +#![no_std] +#![no_main] + +use core::panic::PanicInfo; + +#[panic_handler] +fn panic(_info: &PanicInfo) -> ! { + loop {} +} + +const N: u64 = include!(concat!(env!("OUT_DIR"), "/n.txt")); + +#[inline(never)] +fn halt(code: u64) -> ! { + unsafe { + core::arch::asm!( + "ecall", + in("a0") code, + in("a7") 5u64, + options(noreturn), + ); + } +} + +#[unsafe(no_mangle)] +pub fn main() -> ! 
{ + let mut a: u64 = 0; + let mut b: u64 = 1; + for _ in 0..N { + let c = a.wrapping_add(b); + a = b; + b = c; + } + halt(b) +} From 687d8d87009deac5cb3b5a7823742f916efd108e Mon Sep 17 00:00:00 2001 From: MauroFab Date: Wed, 25 Feb 2026 20:20:41 -0300 Subject: [PATCH 06/34] Remove stray blank line from .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3ef9f8283..9c826f0d9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,3 @@ executor/program_artifacts/ # Shared cargo target directory for ELF builds executor/shared_target/ - From 00696591cf985633c3f939e5c06a7df753ad690a Mon Sep 17 00:00:00 2001 From: Mauro Toscano <12560266+MauroToscano@users.noreply.github.com> Date: Mon, 2 Mar 2026 11:12:21 -0300 Subject: [PATCH 07/34] Apply suggestion from @gabrielbosio Co-authored-by: Gabriel Bosio <38794644+gabrielbosio@users.noreply.github.com> --- bench_vs/run.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 113666124..4aa249a5d 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -102,7 +102,8 @@ run_one() { echo -e " ${GREEN}[Lambda VM] Building (n=${N})...${NC}" (cd "$LAMBDA_DIR" && BENCH_N="$N" cargo +nightly build --release \ --target "$TARGET_SPEC" \ - -Z build-std=core -Z build-std-features=compiler-builtins-mem 2>&1 | tail -1) + -Z build-std=core -Z build-std-features=compiler-builtins-mem 2>&1 \ + -Z json-target-spec 2>&1 | tail -1) LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench" echo -e " ${GREEN}[Lambda VM] Proving...${NC}" From dabe54c5faf9ae4ddc58d492281b6709b4136571 Mon Sep 17 00:00:00 2001 From: MauroFab Date: Mon, 2 Mar 2026 16:57:38 -0300 Subject: [PATCH 08/34] Add instruments --- Cargo.toml | 5 + bin/cli/Cargo.toml | 1 + crypto/stark/src/constraints/evaluator.rs | 36 ---- crypto/stark/src/instruments.rs | 127 ++++++++++++++ crypto/stark/src/lib.rs | 2 + crypto/stark/src/prover.rs | 181 +++++++++++++++---- 
prover/Cargo.toml | 5 + prover/benches/profile_vm_prover.rs | 10 +- prover/src/instruments.rs | 204 ++++++++++++++++++++++ prover/src/lib.rs | 43 +++++ 10 files changed, 538 insertions(+), 76 deletions(-) create mode 100644 crypto/stark/src/instruments.rs create mode 100644 prover/src/instruments.rs diff --git a/Cargo.toml b/Cargo.toml index 577ab04c4..e24fd3bfc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,8 @@ resolver = "2" [profile.dev] opt-level = 3 debug = true + +# debug=1 = line tables only: enables function names in profilers (samply, perf) +# without slowing compilation or bloating the binary significantly. +[profile.release] +debug = 1 diff --git a/bin/cli/Cargo.toml b/bin/cli/Cargo.toml index 8eb62c86f..602e58f5f 100644 --- a/bin/cli/Cargo.toml +++ b/bin/cli/Cargo.toml @@ -15,3 +15,4 @@ tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true } [features] jemalloc-stats = ["dep:tikv-jemalloc-ctl"] +instruments = ["prover/instruments"] diff --git a/crypto/stark/src/constraints/evaluator.rs b/crypto/stark/src/constraints/evaluator.rs index 908d7b950..2c46e334b 100644 --- a/crypto/stark/src/constraints/evaluator.rs +++ b/crypto/stark/src/constraints/evaluator.rs @@ -18,8 +18,6 @@ use rayon::{ }; use std::marker::PhantomData; -#[cfg(feature = "instruments")] -use std::time::Instant; pub struct ConstraintEvaluator< Field: IsSubFieldOf + IsFFTField + Send + Sync, @@ -226,9 +224,6 @@ where #[cfg(all(debug_assertions, not(feature = "parallel")))] let boundary_polys: Vec>> = Vec::new(); - #[cfg(feature = "instruments")] - let timer = Instant::now(); - let trace_length = domain.interpolation_domain_size; let lde_periodic_columns = air .get_periodic_column_polynomials(trace_length) @@ -244,15 +239,6 @@ where .collect::>>, FFTError>>() .unwrap(); - #[cfg(feature = "instruments")] - println!( - " Evaluating periodic columns on lde: {:#?}", - timer.elapsed() - ); - - #[cfg(feature = "instruments")] - let timer = Instant::now(); - // Fused 
boundary evaluation: compute (trace[col] - value) on-the-fly // instead of pre-computing all boundary_polys_evaluations. // This eliminates N_constraints × LDE_size intermediate allocations. @@ -282,12 +268,6 @@ where }) .collect(); - #[cfg(feature = "instruments")] - println!( - " Evaluated boundary polynomials on LDE: {:#?}", - timer.elapsed() - ); - #[cfg(all(debug_assertions, not(feature = "parallel")))] let boundary_zerofiers = Vec::new(); @@ -297,22 +277,12 @@ where #[cfg(all(debug_assertions, not(feature = "parallel")))] let _transition_evaluations: Vec> = Vec::new(); - #[cfg(feature = "instruments")] - let timer = Instant::now(); let zerofier_data = air.transition_zerofier_evaluations_grouped(domain); - #[cfg(feature = "instruments")] - println!( - " Evaluated transition zerofiers: {:#?}", - timer.elapsed() - ); // Iterate over all LDE domain and compute the part of the composition polynomial // related to the transition constraints and add it to the already computed part of the // boundary constraints. - #[cfg(feature = "instruments")] - let timer = Instant::now(); - let num_transition = air.num_transition_constraints(); let num_periodic = lde_periodic_columns.len(); let offsets = &air.context().transition_offsets; @@ -330,12 +300,6 @@ where offsets, ); - #[cfg(feature = "instruments")] - println!( - " Evaluated transitions and accumulated results: {:#?}", - timer.elapsed() - ); - evaluations_t } } diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs new file mode 100644 index 000000000..11ac350af --- /dev/null +++ b/crypto/stark/src/instruments.rs @@ -0,0 +1,127 @@ +use std::cell::RefCell; +use std::time::Duration; + +/// Sub-operation timing breakdown for a single table in Rounds 2-4. 
+#[derive(Clone, Debug, Default)] +pub struct TableSubOps { + /// reconstruct_round1 (expand_pool_to_lde) + pub trace_lde: Duration, + /// evaluator.evaluate() + pub constraints: Duration, + /// decompose_and_extend_d2 + pub comp_decompose: Duration, + /// commit_composition_polynomial + pub comp_commit: Duration, + /// Round 3: barycentric OOD evaluation + pub ood: Duration, + /// Round 4: compute_deep_composition_poly_evaluations + pub deep_comp: Duration, + /// Round 4: interpolate_fft + evaluate_fft + pub deep_extend: Duration, + /// fri::commit_phase_from_evaluations + pub fri_commit: Duration, + /// Round 4: grinding + FRI query + Merkle openings + pub queries: Duration, +} + +/// Sub-operation breakdown for Round 1 (main and aux commit passes). +#[derive(Clone, Debug, Default)] +pub struct Round1SubOps { + /// Main trace: expand_pool_to_lde (LDE/FFT) + pub main_lde: Duration, + /// Main trace: commit_columns_bit_reversed (Merkle) + pub main_merkle: Duration, + /// Aux trace: expand_pool_to_lde (LDE/FFT) + pub aux_lde: Duration, + /// Aux trace: commit_columns_bit_reversed (Merkle) + pub aux_merkle: Duration, +} + +/// Timing data collected inside `multi_prove`. +pub struct MultiProveTiming { + pub prepass: Duration, + pub main_commits: Duration, + pub aux_build: Duration, + pub aux_commit: Duration, + pub rounds_2_4: Duration, + /// Sub-op breakdown for Round 1 (main + aux LDE vs Merkle). + pub round1_sub: Round1SubOps, + /// (name, rows, duration, sub_ops) per table for rounds 2-4. + pub table_timings: Vec<(String, usize, Duration, TableSubOps)>, +} + +thread_local! { + static TIMING_DATA: RefCell<Option<MultiProveTiming>> = const { RefCell::new(None) }; + /// Round 1 sub-timings accumulated across the main-commit and aux-commit loops.
+ static R1_SUB: RefCell<Round1SubOps> = const { RefCell::new(Round1SubOps { + main_lde: Duration::ZERO, main_merkle: Duration::ZERO, + aux_lde: Duration::ZERO, aux_merkle: Duration::ZERO, + }) }; + /// Round 2 sub-timings: (constraints, fft, merkle) + static R2_SUB: RefCell<Option<(Duration, Duration, Duration)>> = const { RefCell::new(None) }; + /// Round 4 sub-timings: (fft, merkle, deep_comp, queries) + static R4_SUB: RefCell<Option<(Duration, Duration, Duration, Duration)>> = const { RefCell::new(None) }; + /// Assembled sub-ops from prove_rounds_2_to_4 (without reconstruct_round1 LDE time). + static ROUND_SUB_OPS: RefCell<Option<TableSubOps>> = const { RefCell::new(None) }; +} + +pub fn store(data: MultiProveTiming) { + TIMING_DATA.with(|cell| { + *cell.borrow_mut() = Some(data); + }); +} + +pub fn take() -> Option<MultiProveTiming> { + TIMING_DATA.with(|cell| cell.borrow_mut().take()) +} + +pub fn accum_r1_main(lde: Duration, merkle: Duration) { + R1_SUB.with(|cell| { + let mut s = cell.borrow_mut(); + s.main_lde += lde; + s.main_merkle += merkle; + }); +} + +pub fn accum_r1_aux(lde: Duration, merkle: Duration) { + R1_SUB.with(|cell| { + let mut s = cell.borrow_mut(); + s.aux_lde += lde; + s.aux_merkle += merkle; + }); +} + +pub fn take_r1_sub() -> Round1SubOps { + R1_SUB.with(|cell| { + std::mem::replace( + &mut *cell.borrow_mut(), + Round1SubOps::default(), + ) + }) +} + +pub fn store_r2_sub(constraints: Duration, fft: Duration, merkle: Duration) { + R2_SUB.with(|cell| *cell.borrow_mut() = Some((constraints, fft, merkle))); +} + +pub fn take_r2_sub() -> Option<(Duration, Duration, Duration)> { + R2_SUB.with(|cell| cell.borrow_mut().take()) +} + +pub fn store_r4_sub(fft: Duration, merkle: Duration, deep_comp: Duration, queries: Duration) { + R4_SUB.with(|cell| *cell.borrow_mut() = Some((fft, merkle, deep_comp, queries))); +} + +pub fn take_r4_sub() -> Option<(Duration, Duration, Duration, Duration)> { + R4_SUB.with(|cell| cell.borrow_mut().take()) +} + +pub fn store_round_sub_ops(data: TableSubOps) { + ROUND_SUB_OPS.with(|cell| { + *cell.borrow_mut() = Some(data); + }); +} + +pub fn
take_round_sub_ops() -> Option<TableSubOps> { + ROUND_SUB_OPS.with(|cell| cell.borrow_mut().take()) +} diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index d8f293589..0415572af 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -1,6 +1,8 @@ #[cfg(feature = "debug-checks")] pub mod bus_debug; pub mod constraints; +#[cfg(feature = "instruments")] +pub mod instruments; pub mod context; pub mod debug; pub mod domain; diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 085bcb03d..c0223d67a 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -445,10 +445,18 @@ pub trait IsStarkProver< { let num_cols = trace.num_main_columns; trace.extract_columns_main_into(main_pool); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); Self::expand_pool_to_lde::(main_pool, num_cols, domain, twiddles); + #[cfg(feature = "instruments")] + let main_lde_dur = t_sub.elapsed(); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let (tree, root) = Self::commit_columns_bit_reversed(&main_pool[..num_cols]) .ok_or(ProvingError::EmptyCommitment)?; + #[cfg(feature = "instruments")] + crate::instruments::accum_r1_main(main_lde_dur, t_sub.elapsed()); transcript.append_bytes(&root); @@ -483,8 +491,14 @@ pub trait IsStarkProver< { let num_cols = trace.num_main_columns; trace.extract_columns_main_into(main_pool); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); Self::expand_pool_to_lde::(main_pool, num_cols, domain, twiddles); + #[cfg(feature = "instruments")] + let main_lde_dur = t_sub.elapsed(); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let (precomputed_tree, precomputed_root) = Self::commit_columns_bit_reversed(&main_pool[..num_precomputed_cols]) .ok_or(ProvingError::EmptyCommitment)?; @@ -492,6 +506,8 @@ pub trait IsStarkProver< let (mult_tree, mult_root) = Self::commit_columns_bit_reversed(&main_pool[num_precomputed_cols..num_cols])
.ok_or(ProvingError::EmptyCommitment)?; + #[cfg(feature = "instruments")] + crate::instruments::accum_r1_main(main_lde_dur, t_sub.elapsed()); debug_assert_eq!( precomputed_root, precomputed_commitment, @@ -806,6 +822,8 @@ pub trait IsStarkProver< round_1_result.bus_public_inputs.as_ref(), trace_length, ); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let constraint_evaluations = evaluator.evaluate( air, &round_1_result.lde_trace, @@ -814,9 +832,13 @@ pub trait IsStarkProver< boundary_coefficients, &round_1_result.rap_challenges, ); + #[cfg(feature = "instruments")] + let constraints_dur = t_sub.elapsed(); let number_of_parts = air.composition_poly_degree_bound(trace_length) / trace_length; + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let lde_composition_poly_parts_evaluations = if number_of_parts == 2 { // Direct quotient decomposition: avoid full-size iFFT by algebraically // splitting H(x) = H₀(x²) + x·H₁(x²) using: @@ -846,12 +868,21 @@ pub trait IsStarkProver< }) .collect() }; + #[cfg(feature = "instruments")] + let fft_dur = t_sub.elapsed(); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let Some((composition_poly_merkle_tree, composition_poly_root)) = Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) else { return Err(ProvingError::EmptyCommitment); }; + #[cfg(feature = "instruments")] + let merkle_dur = t_sub.elapsed(); + + #[cfg(feature = "instruments")] + crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur); Ok(Round2 { lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations, @@ -974,6 +1005,8 @@ pub trait IsStarkProver< let gammas = deep_composition_coefficients; // Compute p₀ (deep composition polynomial) as N evaluations on trace-size coset + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let deep_evals = Self::compute_deep_composition_poly_evaluations( &round_1_result.lde_trace, round_2_result, @@ -984,18 +1017,26 @@ pub 
trait IsStarkProver< &gammas, &trace_term_coeffs, ); + #[cfg(feature = "instruments")] + let other_dur_1 = t_sub.elapsed(); // Extend N trace-coset evaluations to 2N LDE-coset evaluations via standard LDE. // deep_evals[i] = h(offset·ω_N^i) = f(ω_N^i) where f(x) = h(offset·x). // Standard iFFT+FFT recovers f and evaluates on the 2N-th roots: f(Ω^j) = h(offset·Ω^j). let domain_size = domain.lde_roots_of_unity_coset.len(); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let deep_poly = Polynomial::interpolate_fft::(&deep_evals).expect("iFFT should succeed"); let mut lde_evals = Polynomial::evaluate_fft::(&deep_poly, 1, Some(domain_size)) .expect("FFT should succeed"); in_place_bit_reverse_permute(&mut lde_evals); + #[cfg(feature = "instruments")] + let r4_fft_dur = t_sub.elapsed(); // FRI commit phase from pre-computed evaluations (no initial FFT) + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let (fri_last_value, fri_layers) = fri::commit_phase_from_evaluations::( domain.root_order as usize, @@ -1004,8 +1045,12 @@ pub trait IsStarkProver< &coset_offset, domain_size, ); + #[cfg(feature = "instruments")] + let r4_merkle_dur = t_sub.elapsed(); // grinding: generate nonce and append it to the transcript + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let security_bits = air.context().proof_options.grinding_factor; let mut nonce = None; if security_bits > 0 { @@ -1028,6 +1073,12 @@ pub trait IsStarkProver< let deep_poly_openings = Self::open_deep_composition_poly(domain, round_1_result, round_2_result, &iotas); + #[cfg(feature = "instruments")] + { + let queries_dur = t_sub.elapsed(); + crate::instruments::store_r4_sub(r4_fft_dur, r4_merkle_dur, other_dur_1, queries_dur); + } + Round4 { fri_last_value, fri_layers_merkle_roots, @@ -1408,6 +1459,9 @@ pub trait IsStarkProver< // Pre-pass: compute domains, twiddles, and max dimensions for pool allocation // ===================================================================== 
+ #[cfg(feature = "instruments")] + let phase_start = Instant::now(); + let mut domains = Vec::with_capacity(num_airs); let mut twiddle_caches: Vec> = Vec::with_capacity(num_airs); let mut max_main_cols = 0usize; @@ -1437,12 +1491,18 @@ pub trait IsStarkProver< .map(|_| Vec::with_capacity(max_lde_size)) .collect(); + #[cfg(feature = "instruments")] + let prepass_elapsed = phase_start.elapsed(); + // ===================================================================== // Round 1, Phase A: Commit all main traces (lightweight) // ===================================================================== // All main trace commitments must be in the transcript before sampling // LogUp challenges. Pool buffers are reused across tables. + #[cfg(feature = "instruments")] + let phase_start = Instant::now(); + let mut main_commits: Vec> = Vec::with_capacity(num_airs); for ((air, trace, _pub_inputs), twiddles) in @@ -1473,6 +1533,9 @@ pub trait IsStarkProver< }); } + #[cfg(feature = "instruments")] + let main_commits_elapsed = phase_start.elapsed(); + // ===================================================================== // Round 1, Phase B: Sample shared LogUp challenges // ===================================================================== @@ -1499,6 +1562,9 @@ pub trait IsStarkProver< // Pass 1: Build aux traces in parallel. // Each build_auxiliary_trace has internal parallelism (batch_inverse, par_chunks), // but outer parallelism over 12 tables also helps on high-core-count machines. + #[cfg(feature = "instruments")] + let phase_start = Instant::now(); + #[cfg(feature = "parallel")] let aux_iter = air_trace_pairs.par_iter_mut(); #[cfg(not(feature = "parallel"))] @@ -1513,8 +1579,14 @@ pub trait IsStarkProver< }) .collect(); + #[cfg(feature = "instruments")] + let aux_build_elapsed = phase_start.elapsed(); + // Pass 2: Sequential fork transcript → extract → LDE → commit. // Uses shared aux_pool. Each table gets its own transcript fork. 
+ #[cfg(feature = "instruments")] + let phase_start = Instant::now(); + let mut metadatas: Vec> = Vec::with_capacity(num_airs); let mut table_transcripts = Vec::with_capacity(num_airs); @@ -1537,15 +1609,23 @@ pub trait IsStarkProver< let (aux_tree, aux_root) = if air.has_aux_trace() { let num_aux_cols = trace.num_aux_columns; trace.extract_columns_aux_into(&mut aux_pool); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); Self::expand_pool_to_lde::( &mut aux_pool, num_aux_cols, domain, twiddles, ); + #[cfg(feature = "instruments")] + let aux_lde_dur = t_sub.elapsed(); + #[cfg(feature = "instruments")] + let t_sub = Instant::now(); let (tree, root) = Self::commit_columns_bit_reversed(&aux_pool[..num_aux_cols]) .ok_or(ProvingError::EmptyCommitment)?; + #[cfg(feature = "instruments")] + crate::instruments::accum_r1_aux(aux_lde_dur, t_sub.elapsed()); table_transcript.append_bytes(&root); (Some(Rc::new(tree)), Some(root)) @@ -1567,6 +1647,9 @@ pub trait IsStarkProver< table_transcripts.push(table_transcript); } + #[cfg(feature = "instruments")] + let aux_commit_elapsed = phase_start.elapsed(); + #[cfg(feature = "debug-checks")] Self::run_debug_checks( &air_trace_pairs, @@ -1583,6 +1666,12 @@ pub trait IsStarkProver< // For each table, recompute LDE into pool buffers, reuse stored Merkle trees, // run rounds 2-4 with the table's forked transcript, then drop table data. 
+ #[cfg(feature = "instruments")] + let phase_start = Instant::now(); + #[cfg(feature = "instruments")] + let mut table_timings: Vec<(String, usize, std::time::Duration, crate::instruments::TableSubOps)> = + Vec::with_capacity(num_airs); + let mut proofs = Vec::with_capacity(num_airs); for (((((air, trace, pub_inputs), metadata), domain), twiddles), table_transcript) in air_trace_pairs @@ -1592,7 +1681,12 @@ pub trait IsStarkProver< .zip(twiddle_caches.iter()) .zip(table_transcripts.iter_mut()) { + #[cfg(feature = "instruments")] + let table_start = Instant::now(); + // Recompute LDE evaluations into pool, reuse stored Merkle trees + #[cfg(feature = "instruments")] + let lde_start = Instant::now(); let round_1_result = Self::reconstruct_round1( *air, *trace, @@ -1602,6 +1696,8 @@ pub trait IsStarkProver< &mut main_pool, &mut aux_pool, )?; + #[cfg(feature = "instruments")] + let lde_dur = lde_start.elapsed(); let proof = Self::prove_rounds_2_to_4( *air, @@ -1612,6 +1708,19 @@ pub trait IsStarkProver< )?; proofs.push(proof); + #[cfg(feature = "instruments")] + { + let mut sub_ops = crate::instruments::take_round_sub_ops() + .unwrap_or_default(); + sub_ops.trace_lde += lde_dur; + table_timings.push(( + air.name().to_string(), + trace.num_rows(), + table_start.elapsed(), + sub_ops, + )); + } + // Return column Vecs to pool (zero-copy move back). Pool slots that were // `take`n in reconstruct_round1 get their buffers back with capacity intact. let (main_cols, aux_cols) = round_1_result.lde_trace.into_columns(); @@ -1623,6 +1732,21 @@ pub trait IsStarkProver< } } + #[cfg(feature = "instruments")] + { + // Store timing data for the top-level report in prove_with_options. + // Uses a thread-local to avoid changing multi_prove's return type. 
+ crate::instruments::store(crate::instruments::MultiProveTiming { + prepass: prepass_elapsed, + main_commits: main_commits_elapsed, + aux_build: aux_build_elapsed, + aux_commit: aux_commit_elapsed, + rounds_2_4: phase_start.elapsed(), + round1_sub: crate::instruments::take_r1_sub(), + table_timings, + }); + } + Ok(MultiProof::new(proofs)) } @@ -1665,11 +1789,6 @@ pub trait IsStarkProver< // ==========| Round 2 |========== // =================================== - #[cfg(feature = "instruments")] - println!("- Started round 2: Compute composition polynomial"); - #[cfg(feature = "instruments")] - let timer2 = Instant::now(); - // <<<< Receive challenge: 𝛽 let beta = transcript.sample_field_element(); let trace_length = domain.interpolation_domain_size; @@ -1706,26 +1825,18 @@ pub trait IsStarkProver< // >>>> Send commitments: [H₁], [H₂] transcript.append_bytes(&round_2_result.composition_poly_root); - #[cfg(feature = "instruments")] - let elapsed2 = timer2.elapsed(); - #[cfg(feature = "instruments")] - println!(" Time spent: {:?}", elapsed2); - // =================================== // ==========| Round 3 |========== // =================================== - #[cfg(feature = "instruments")] - println!("- Started round 3: Evaluate polynomial in out of domain elements"); - #[cfg(feature = "instruments")] - let timer3 = Instant::now(); - // <<<< Receive challenge: z let z = transcript.sample_z_ood( &domain.lde_roots_of_unity_coset, &domain.trace_roots_of_unity, ); + #[cfg(feature = "instruments")] + let t_r3 = Instant::now(); let round_3_result = Self::round_3_evaluate_polynomials_in_out_of_domain_element( air, domain, @@ -1733,6 +1844,8 @@ pub trait IsStarkProver< &round_2_result, &z, ); + #[cfg(feature = "instruments")] + let round_3_dur = t_r3.elapsed(); // >>>> Send values: tⱼ(zgᵏ) let trace_ood_evaluations_columns = round_3_result.trace_ood_evaluations.columns(); @@ -1747,20 +1860,10 @@ pub trait IsStarkProver< transcript.append_field_element(element); } - 
#[cfg(feature = "instruments")] - let elapsed3 = timer3.elapsed(); - #[cfg(feature = "instruments")] - println!(" Time spent: {:?}", elapsed3); - // =================================== // ==========| Round 4 |========== // =================================== - #[cfg(feature = "instruments")] - println!("- Started round 4: FRI"); - #[cfg(feature = "instruments")] - let timer4 = Instant::now(); - // Part of this round is running FRI, which is an interactive // protocol on its own. Therefore we pass it the transcript // to simulate the interactions with the verifier. @@ -1774,20 +1877,24 @@ pub trait IsStarkProver< transcript, ); - #[cfg(feature = "instruments")] - let elapsed4 = timer4.elapsed(); - #[cfg(feature = "instruments")] - println!(" Time spent: {:?}", elapsed4); - #[cfg(feature = "instruments")] { - let total_time = elapsed2 + elapsed3 + elapsed4; - println!( - " Fraction of proving time per round: {:.4} {:.4} {:.4}", - elapsed2.as_nanos() as f64 / total_time.as_nanos() as f64, - elapsed3.as_nanos() as f64 / total_time.as_nanos() as f64, - elapsed4.as_nanos() as f64 / total_time.as_nanos() as f64 - ); + let zero = std::time::Duration::ZERO; + let (r2_constraints, r2_fft, r2_merkle) = + crate::instruments::take_r2_sub().unwrap_or((zero, zero, zero)); + let (r4_fft, r4_merkle, r4_deep_comp, r4_queries) = + crate::instruments::take_r4_sub().unwrap_or((zero, zero, zero, zero)); + crate::instruments::store_round_sub_ops(crate::instruments::TableSubOps { + trace_lde: std::time::Duration::ZERO, // added by caller from lde_dur + constraints: r2_constraints, + comp_decompose: r2_fft, + comp_commit: r2_merkle, + ood: round_3_dur, + deep_comp: r4_deep_comp, + deep_extend: r4_fft, + fri_commit: r4_merkle, + queries: r4_queries, + }); } info!("End proof generation"); diff --git a/prover/Cargo.toml b/prover/Cargo.toml index 56189724d..dac711002 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -7,6 +7,7 @@ edition = "2024" default = ["parallel"] parallel = 
["stark/parallel", "math/parallel", "crypto/parallel", "dep:rayon"] debug-checks = ["stark/debug-checks"] +instruments = ["stark/instruments"] [dependencies] stark = { path = "../crypto/stark" } @@ -23,3 +24,7 @@ criterion = { version = "0.5", default-features = false } [[bench]] name = "vm_prover_benchmark" harness = false + +[[bench]] +name = "profile_vm_prover" +harness = false diff --git a/prover/benches/profile_vm_prover.rs b/prover/benches/profile_vm_prover.rs index 87cb19f50..5ec78a1d3 100644 --- a/prover/benches/profile_vm_prover.rs +++ b/prover/benches/profile_vm_prover.rs @@ -3,13 +3,17 @@ // Run with: `samply record cargo bench --bench profile_vm_prover --features parallel` // Or with hyperfine: `hyperfine --runs 1 './target/release/deps/profile_vm_prover-*'` // -// Uses all_instructions_64.elf which exercises all supported RISC-V instructions. +// Default ELF: fib_iterative_372k (~372k steps, realistic workload). +// Override: cargo bench --bench profile_vm_prover --features parallel -- use lambda_vm_prover::test_utils::asm_elf_bytes; fn main() { - let elf_name = "all_instructions_64"; - let elf_bytes = asm_elf_bytes(elf_name); + let elf_name = std::env::args() + .skip(1) + .find(|a| !a.starts_with('-')) + .unwrap_or_else(|| "fib_iterative_372k".to_string()); + let elf_bytes = asm_elf_bytes(&elf_name); println!("Starting VM prover profiling..."); println!("Configuration:"); diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs new file mode 100644 index 000000000..58954a919 --- /dev/null +++ b/prover/src/instruments.rs @@ -0,0 +1,204 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +fn fmt_rows(rows: usize) -> String { + if rows >= 1_000_000 { + format!("{:.1}M", rows as f64 / 1_000_000.0) + } else if rows >= 1_000 { + format!("{}K", rows / 1_000) + } else { + format!("{rows}") + } +} + +fn pct(dur: Duration, total: Duration) -> f64 { + if total > Duration::ZERO { + dur.as_secs_f64() / total.as_secs_f64() * 100.0 + } else { + 
0.0 + } +} + +/// Top-level row: % in first column. +fn row_top(label: &str, dur: Duration, total: Duration) { + eprintln!( + " {:<36} {:>7.2}s {:>5.1}%", + label, + dur.as_secs_f64(), + pct(dur, total), + ); +} + +/// Sub-level row: % shifted right into its own column. +fn row_sub(label: &str, dur: Duration, total: Duration) { + eprintln!( + " {:<36} {:>7.2}s {:>5.1}%", + label, + dur.as_secs_f64(), + pct(dur, total), + ); +} + +/// Strip the `[N]` suffix to get the base table name. +fn base_name(name: &str) -> &str { + name.find('[').map_or(name, |i| &name[..i]) +} + +struct MergedTable { + total_dur: Duration, + total_rows: usize, + count: usize, + sub_ops: stark::instruments::TableSubOps, +} + +/// Print a unified timing report to stderr. +pub fn print_report( + execute: Duration, + trace_build: Duration, + air_construction: Duration, + _prove: Duration, + total: Duration, +) { + let mp = stark::instruments::take(); + + eprintln!(); + eprintln!("=== PROVER TIMING ==="); + eprintln!( + " {:<36} {:>8} {:>5}", + "Phase", "Wall", "%", + ); + eprintln!(" {}", "─".repeat(58)); + + row_top("Execute", execute, total); + row_top("Trace build", trace_build, total); + row_top("AIR construction", air_construction, total); + + if let Some(mp) = mp { + let round1 = mp.main_commits + mp.aux_build + mp.aux_commit; + + row_top("Pre-pass (domains/twiddles)", mp.prepass, total); + row_top("Round 1", round1, total); + row_sub(" Main trace commits", mp.main_commits, total); + row_sub(" expand_pool_to_lde", mp.round1_sub.main_lde, total); + row_sub(" commit (Merkle)", mp.round1_sub.main_merkle, total); + row_sub(" Aux trace build (parallel)", mp.aux_build, total); + row_sub(" Aux trace commit", mp.aux_commit, total); + row_sub(" expand_pool_to_lde", mp.round1_sub.aux_lde, total); + row_sub(" commit (Merkle)", mp.round1_sub.aux_merkle, total); + row_top("Rounds 2\u{2013}4", mp.rounds_2_4, total); + + // Merge split tables: MEMW[0..4] → MEMW x5 + let mut merged: BTreeMap = 
BTreeMap::new(); + for (name, rows, dur, sub_ops) in &mp.table_timings { + let base = base_name(name).to_string(); + let entry = merged.entry(base).or_insert(MergedTable { + total_dur: Duration::ZERO, + total_rows: 0, + count: 0, + sub_ops: stark::instruments::TableSubOps::default(), + }); + entry.total_dur += *dur; + entry.total_rows += rows; + entry.count += 1; + entry.sub_ops.trace_lde += sub_ops.trace_lde; + entry.sub_ops.constraints += sub_ops.constraints; + entry.sub_ops.comp_decompose += sub_ops.comp_decompose; + entry.sub_ops.comp_commit += sub_ops.comp_commit; + entry.sub_ops.ood += sub_ops.ood; + entry.sub_ops.deep_comp += sub_ops.deep_comp; + entry.sub_ops.deep_extend += sub_ops.deep_extend; + entry.sub_ops.fri_commit += sub_ops.fri_commit; + entry.sub_ops.queries += sub_ops.queries; + } + + let mut sorted: Vec<_> = merged.into_iter().collect(); + sorted.sort_by(|a, b| b.1.total_dur.cmp(&a.1.total_dur)); + + let threshold = total.as_secs_f64() * 0.02; + let mut others_dur = Duration::ZERO; + let mut others_count = 0usize; + + for (name, t) in &sorted { + if t.total_dur.as_secs_f64() >= threshold { + let display_name = if t.count > 1 { + format!("{name} x{}", t.count) + } else { + name.clone() + }; + let label = format!( + " {:<18} {:>6}", + display_name, + fmt_rows(t.total_rows), + ); + row_sub(&label, t.total_dur, total); + } else { + others_dur += t.total_dur; + others_count += 1; + } + } + if others_count > 0 { + let label = format!(" ({others_count} others)"); + row_sub(&label, others_dur, total); + } + + // Sub-operation totals across all tables + let mut total_trace_lde = Duration::ZERO; + let mut total_constraints = Duration::ZERO; + let mut total_comp_decompose = Duration::ZERO; + let mut total_comp_commit = Duration::ZERO; + let mut total_ood = Duration::ZERO; + let mut total_deep_comp = Duration::ZERO; + let mut total_deep_extend = Duration::ZERO; + let mut total_fri_commit = Duration::ZERO; + let mut total_queries = Duration::ZERO; + for (_, 
t) in &sorted { + total_trace_lde += t.sub_ops.trace_lde; + total_constraints += t.sub_ops.constraints; + total_comp_decompose += t.sub_ops.comp_decompose; + total_comp_commit += t.sub_ops.comp_commit; + total_ood += t.sub_ops.ood; + total_deep_comp += t.sub_ops.deep_comp; + total_deep_extend += t.sub_ops.deep_extend; + total_fri_commit += t.sub_ops.fri_commit; + total_queries += t.sub_ops.queries; + } + + let sub_ops_sum = total_trace_lde + total_constraints + total_comp_decompose + + total_comp_commit + total_ood + total_deep_comp + total_deep_extend + + total_fri_commit + total_queries; + if sub_ops_sum > Duration::ZERO { + let mut sub_ops: Vec<(&str, Duration)> = vec![ + ("R1 expand_pool_to_lde", total_trace_lde), + ("R2 evaluate", total_constraints), + ("R2 decompose_and_extend_d2", total_comp_decompose), + ("R2 commit_composition_poly", total_comp_commit), + ("R3 OOD evaluation", total_ood), + ("R4 deep_composition_poly_evals", total_deep_comp), + ("R4 interpolate+evaluate_fft", total_deep_extend), + ("R4 fri::commit_phase", total_fri_commit), + ("R4 queries & openings", total_queries), + ]; + sub_ops.sort_by(|a, b| b.1.cmp(&a.1)); + eprintln!( + " {}", + " \u{2500}\u{2500} sub-operation totals (all tables) \u{2500}\u{2500}", + ); + for (label, dur) in &sub_ops { + row_sub(&format!(" {label}"), *dur, total); + } + } + + // Cross-round totals: all FFT work and all Merkle work + let total_fft = mp.round1_sub.main_lde + mp.round1_sub.aux_lde + + total_trace_lde + total_comp_decompose + total_deep_extend; + let total_merkle = mp.round1_sub.main_merkle + mp.round1_sub.aux_merkle + + total_comp_commit + total_fri_commit; + eprintln!(); + eprintln!(" {:<36} {:>7.2}s {:>5.1}%", "Total FFT", total_fft.as_secs_f64(), pct(total_fft, total)); + eprintln!(" {:<36} {:>7.2}s {:>5.1}%", "Total Merkle", total_merkle.as_secs_f64(), pct(total_merkle, total)); + } + + eprintln!(" {}", "─".repeat(58)); + eprintln!(" {:<36} {:>7.2}s", "TOTAL", total.as_secs_f64()); + eprintln!(); 
+} diff --git a/prover/src/lib.rs b/prover/src/lib.rs index 7539d327a..01f51934e 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -13,6 +13,8 @@ pub mod constraints; #[cfg(feature = "debug-checks")] mod debug_report; +#[cfg(feature = "instruments")] +pub mod instruments; pub mod tables; pub mod test_utils; pub mod tests; @@ -342,15 +344,37 @@ pub fn prove_with_options( proof_options: &ProofOptions, max_rows: &MaxRowsConfig, ) -> Result { + #[cfg(feature = "instruments")] + let total_start = std::time::Instant::now(); + + // Phase 1: Execute (ELF load + run) + #[cfg(feature = "instruments")] + let phase_start = std::time::Instant::now(); + let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?; let executor = Executor::new(&program, vec![]).map_err(|e| Error::Execution(format!("{e}")))?; let result = executor .run() .map_err(|e| Error::Execution(format!("{e}")))?; + #[cfg(feature = "instruments")] + let execute_elapsed = phase_start.elapsed(); + + // Phase 2: Trace build + #[cfg(feature = "instruments")] + let phase_start = std::time::Instant::now(); + // Generate all traces from ELF and execution logs. // Page tables are derived from the prover's MemoryState (all accessed pages). 
let mut traces = Traces::from_elf_and_logs(&program, &result.logs, max_rows)?; + + #[cfg(feature = "instruments")] + let trace_build_elapsed = phase_start.elapsed(); + + // Phase 3: AIR construction + #[cfg(feature = "instruments")] + let phase_start = std::time::Instant::now(); + let table_counts = traces.table_counts(); let airs = VmAirs::new( &program, @@ -360,14 +384,33 @@ pub fn prove_with_options( &table_counts, ); + #[cfg(feature = "instruments")] + let air_elapsed = phase_start.elapsed(); + let runtime_page_ranges = traces.runtime_page_ranges(); + // Phase 4: Prove (multi_prove) + #[cfg(feature = "instruments")] + let phase_start = std::time::Instant::now(); + let proof = Prover::multi_prove( airs.air_trace_pairs(&mut traces), &mut DefaultTranscript::::new(&[]), ) .map_err(|e| Error::Prover(format!("{e:?}")))?; + #[cfg(feature = "instruments")] + { + let prove_elapsed = phase_start.elapsed(); + instruments::print_report( + execute_elapsed, + trace_build_elapsed, + air_elapsed, + prove_elapsed, + total_start.elapsed(), + ); + } + Ok(VmProof { proof, runtime_page_ranges, From af6aab3a0e6f747356a81e176e75dc5662dfeb9a Mon Sep 17 00:00:00 2001 From: MauroFab Date: Sat, 28 Mar 2026 17:13:50 +0100 Subject: [PATCH 09/34] Fix syscall number --- bench_vs/lambda/fibonacci/src/main.rs | 2 +- bench_vs/run.sh | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bench_vs/lambda/fibonacci/src/main.rs b/bench_vs/lambda/fibonacci/src/main.rs index 8f54cf604..ff06237bc 100644 --- a/bench_vs/lambda/fibonacci/src/main.rs +++ b/bench_vs/lambda/fibonacci/src/main.rs @@ -16,7 +16,7 @@ fn halt(code: u64) -> ! 
{ core::arch::asm!( "ecall", in("a0") code, - in("a7") 5u64, + in("a7") 93u64, options(noreturn), ); } diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 4aa249a5d..8c3cb8179 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -107,7 +107,11 @@ run_one() { LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench" echo -e " ${GREEN}[Lambda VM] Proving...${NC}" - LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>/dev/null) + LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>"$TMP_DIR/lambda_err.txt") + if [ $? -ne 0 ]; then + echo -e " ${RED}[Lambda VM] FAILED:${NC}" + cat "$TMP_DIR/lambda_err.txt" + fi lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" fi From 687af82fa77e456f0d93c4fd40deb174fac6a72f Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 31 Mar 2026 19:49:01 -0300 Subject: [PATCH 10/34] Add runtime private inputs, nightly CI workflow, and 500M step projection to bench_vs --- .github/workflows/bench-vs-nightly.yml | 58 +++ bench_vs/lambda/fibonacci/build.rs | 10 - bench_vs/lambda/fibonacci/src/main.rs | 49 ++- bench_vs/run.sh | 481 +++++++++++++++++++++---- bin/cli/src/main.rs | 68 +++- crypto/stark/src/lib.rs | 2 - prover/src/lib.rs | 22 +- 7 files changed, 590 insertions(+), 100 deletions(-) create mode 100644 .github/workflows/bench-vs-nightly.yml delete mode 100644 bench_vs/lambda/fibonacci/build.rs diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml new file mode 100644 index 000000000..50e88bb63 --- /dev/null +++ b/.github/workflows/bench-vs-nightly.yml @@ -0,0 +1,58 @@ +name: Bench Vs Nightly + +on: + schedule: + # 03:00 America/Argentina/Buenos_Aires = 06:00 UTC + - cron: "0 6 * * *" + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: bench-vs-nightly-${{ github.ref }} + cancel-in-progress: true 
+ +jobs: + bench-vs: + runs-on: [self-hosted, bench] + timeout-minutes: 720 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Rust Environment + uses: ./.github/actions/setup-rust + + - name: Add cargo to PATH + run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" + + - name: Add SP1 to PATH + run: echo "$HOME/.sp1/bin" >> "$GITHUB_PATH" + + - name: Install SP1 toolchain + run: | + export PATH="$HOME/.cargo/bin:$HOME/.sp1/bin:$PATH" + if ! cargo prove --version >/dev/null 2>&1; then + curl -L https://sp1up.succinct.xyz | bash + export PATH="$HOME/.sp1/bin:$PATH" + sp1up + fi + cargo prove --version + + - name: Run nightly benchmark + run: | + bash ./bench_vs/run.sh \ + -n 1000000 2000000 4000000 8000000 \ + --report-dir bench_vs_artifacts \ + --no-color + + - name: Upload nightly benchmark artifact + uses: actions/upload-artifact@v4 + with: + name: bench-vs-nightly-${{ github.sha }} + path: bench_vs_artifacts + retention-days: 90 + + - name: Publish summary + run: cat bench_vs_artifacts/summary.md >> "$GITHUB_STEP_SUMMARY" diff --git a/bench_vs/lambda/fibonacci/build.rs b/bench_vs/lambda/fibonacci/build.rs deleted file mode 100644 index 5c189eadb..000000000 --- a/bench_vs/lambda/fibonacci/build.rs +++ /dev/null @@ -1,10 +0,0 @@ -use std::env; -use std::fs; -use std::path::Path; - -fn main() { - let n = env::var("BENCH_N").unwrap_or_else(|_| "1000".to_string()); - let out_dir = env::var("OUT_DIR").unwrap(); - fs::write(Path::new(&out_dir).join("n.txt"), &n).unwrap(); - println!("cargo:rerun-if-env-changed=BENCH_N"); -} diff --git a/bench_vs/lambda/fibonacci/src/main.rs b/bench_vs/lambda/fibonacci/src/main.rs index ff06237bc..e9e673e0c 100644 --- a/bench_vs/lambda/fibonacci/src/main.rs +++ b/bench_vs/lambda/fibonacci/src/main.rs @@ -1,22 +1,52 @@ #![no_std] #![no_main] +use core::arch::asm; use core::panic::PanicInfo; +const SYSCALL_GET_PRIVATE_INPUTS: u64 = 4; +const SYSCALL_COMMIT: u64 = 64; +const SYSCALL_HALT: u64 = 93; + #[panic_handler] fn 
panic(_info: &PanicInfo) -> ! { loop {} } -const N: u64 = include!(concat!(env!("OUT_DIR"), "/n.txt")); +fn read_n() -> u64 { + let mut input = [0u8; 12]; + + unsafe { + asm!( + "ecall", + in("a0") input.as_mut_ptr(), + in("a7") SYSCALL_GET_PRIVATE_INPUTS, + ); + } + + let mut n_bytes = [0u8; 8]; + n_bytes.copy_from_slice(&input[4..12]); + u64::from_le_bytes(n_bytes) +} -#[inline(never)] -fn halt(code: u64) -> ! { +fn commit(bytes: &[u8]) { unsafe { - core::arch::asm!( + asm!( "ecall", - in("a0") code, - in("a7") 93u64, + in("a0") 1u64, + in("a1") bytes.as_ptr(), + in("a2") bytes.len(), + in("a7") SYSCALL_COMMIT, + ); + } +} + +fn halt() -> ! { + unsafe { + asm!( + "ecall", + in("a0") 0u64, + in("a7") SYSCALL_HALT, options(noreturn), ); } @@ -24,12 +54,15 @@ fn halt(code: u64) -> ! { #[unsafe(no_mangle)] pub fn main() -> ! { + let n = read_n(); let mut a: u64 = 0; let mut b: u64 = 1; - for _ in 0..N { + for _ in 0..n { let c = a.wrapping_add(b); a = b; b = c; } - halt(b) + + commit(&b.to_le_bytes()); + halt() } diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 8c3cb8179..5592e095c 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -2,20 +2,23 @@ # Benchmark: Lambda VM vs SP1 v6 — Fibonacci proving time comparison. # # Usage: ./bench_vs/run.sh [-n 1000 50000 100000] [--lambda-only | --sp1-only] +# [--report-dir DIR] [--no-color] # # Without -n, runs the default series: 1000 10000 100000 300000 -# With -n, runs the specified values (space-separated): -n 1000 50000 # # Prerequisites: -# - Lambda VM CLI built: cargo build --release -p cli -# - SP1 toolchain installed: curl -L https://sp1up.succinct.xyz | bash && sp1up -# - Rust nightly toolchain: rustup toolchain install nightly +# - Lambda VM CLI build dependencies available +# - SP1 toolchain installed (or available in PATH for CI) +# - Rust stable + nightly-2026-02-01 installed set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" TMP_DIR="/tmp/bench_fib" +REPORT_DIR="" +NO_COLOR=false +TARGET_STEPS=500000000 RED='\033[0;31m' GREEN='\033[0;32m' @@ -23,31 +26,53 @@ YELLOW='\033[1;33m' BOLD='\033[1m' NC='\033[0m' -# --- Defaults ---------------------------------------------------------------- +# --- Defaults --------------------------------------------------------------- DEFAULT_SERIES=(1000 10000 100000 300000) SERIES=() RUN_LAMBDA=true RUN_SP1=true -# --- Parse args -------------------------------------------------------------- +# --- Parse args ------------------------------------------------------------- while [[ $# -gt 0 ]]; do case $1 in - -n) shift + -n) + shift while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do - SERIES+=("$1"); shift - done ;; - --lambda-only) RUN_SP1=false; shift ;; - --sp1-only) RUN_LAMBDA=false; shift ;; + SERIES+=("$1") + shift + done + ;; + --lambda-only) + RUN_SP1=false + shift + ;; + --sp1-only) + RUN_LAMBDA=false + shift + ;; + --report-dir) + REPORT_DIR=$2 + shift 2 + ;; + --no-color) + NO_COLOR=true + shift + ;; -h|--help) - echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only]" + echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--no-color]" echo "" - echo " -n N1 N2 ... Fibonacci iteration counts (space-separated)" - echo " Default series: ${DEFAULT_SERIES[*]}" - echo " --lambda-only Only run Lambda VM benchmark" - echo " --sp1-only Only run SP1 benchmark" + echo " -n N1 N2 ... Fibonacci iteration counts (space-separated)" + echo " Default series: ${DEFAULT_SERIES[*]}" + echo " --lambda-only Only run Lambda VM benchmark" + echo " --sp1-only Only run SP1 benchmark" + echo " --report-dir DIR Write TSV, metrics, markdown summary, and raw outputs" + echo " --no-color Disable ANSI colors" exit 0 ;; - *) echo "Unknown option: $1"; exit 1 ;; + *) + echo "Unknown option: $1" + exit 1 + ;; esac done @@ -55,21 +80,152 @@ if [ ${#SERIES[@]} -eq 0 ]; then SERIES=("${DEFAULT_SERIES[@]}") fi +if ! $RUN_LAMBDA && ! 
$RUN_SP1; then + echo "At least one prover must be enabled" + exit 1 +fi + +if $NO_COLOR; then + RED='' + GREEN='' + YELLOW='' + BOLD='' + NC='' +fi + +mkdir -p "$TMP_DIR" +rm -rf "$TMP_DIR"/* + +if [ -n "$REPORT_DIR" ]; then + mkdir -p "$REPORT_DIR/raw" +fi + +join_slash() { + local joined="" + local value + for value in "$@"; do + joined="${joined:+$joined/}$value" + done + printf "%s\n" "$joined" +} + +fit_series() { + local steps_slash=$1 + local values_slash=$2 + + awk -v steps="$steps_slash" -v values="$values_slash" 'BEGIN { + n = split(steps, xs, "/") + m = split(values, ys, "/") + if (n == 0 || n != m) { + print "0 0 0.0000" + exit + } + + sx = 0; sy = 0; sxy = 0; sx2 = 0 + for (i = 1; i <= n; i++) { + x = xs[i] / 1000000 + y = ys[i] + 0 + sx += x + sy += y + sxy += x * y + sx2 += x * x + } + + d = n * sx2 - sx * sx + if (d == 0) { + intercept = sy / n + printf "0 %.6f 0.0000\n", intercept + exit + } + + slope = (n * sxy - sx * sy) / d + intercept = (sy - slope * sx) / n + + my = sy / n + ss_tot = 0 + ss_res = 0 + for (i = 1; i <= n; i++) { + x = xs[i] / 1000000 + y = ys[i] + 0 + pred = slope * x + intercept + ss_res += (y - pred) * (y - pred) + ss_tot += (y - my) * (y - my) + } + + r2 = (ss_tot > 0) ? 
1 - ss_res / ss_tot : 0 + if (r2 < 0) { + r2 = 0 + } + + printf "%.6f %.6f %.4f\n", slope, intercept, r2 + }' +} + +project_series() { + local slope=$1 + local intercept=$2 + local target_steps=$3 + + awk -v slope="$slope" -v intercept="$intercept" -v target="$target_steps" 'BEGIN { + projected = slope * (target / 1000000) + intercept + if (projected < 0) { + projected = 0 + } + printf "%.3f\n", projected + }' +} + +format_hours() { + local seconds=$1 + awk -v value="$seconds" 'BEGIN { printf "%.2f\n", value / 3600 }' +} + +write_u64_le() { + local value=$1 + local output_path=$2 + + python3 - "$value" "$output_path" <<'PY' +import struct +import sys + +value = int(sys.argv[1]) +path = sys.argv[2] + +with open(path, "wb") as fh: + fh.write(struct.pack("&1 | tail -1 +if $RUN_LAMBDA; then + echo -e "${GREEN}[Lambda VM] Building CLI...${NC}" + cargo build --release -p cli --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -5 +fi + +if $RUN_LAMBDA; then + echo -e "${GREEN}[Lambda VM] Building fibonacci prover...${NC}" + ( + cd "$LAMBDA_DIR" && \ + cargo +nightly-2026-02-01 build --release \ + --target "$TARGET_SPEC" \ + -Z build-std=core \ + -Z build-std-features=compiler-builtins-mem \ + -Z json-target-spec 2>&1 | tail -5 + ) + if [ ! 
-f "$LAMBDA_ELF" ]; then + echo -e "${RED}[Lambda VM] Build failed — fibonacci-bench ELF not found${NC}" + exit 1 + fi fi SP1_BIN="" @@ -84,101 +240,284 @@ if $RUN_SP1; then fi fi -# --- Run one benchmark -------------------------------------------------------- +# --- Run benchmark series --------------------------------------------------- + +RESULT_N=() +RESULT_LAMBDA=() +RESULT_SP1=() +RESULT_SP1_CYCLES=() +RESULT_RATIO=() -# Arrays to collect results for the summary table -declare -a RESULT_N RESULT_LAMBDA RESULT_SP1 +LAMBDA_STEPS=() +LAMBDA_TIMES=() +SP1_STEPS=() +SP1_TIMES=() + +if [ -n "$REPORT_DIR" ]; then + printf "n\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv" +fi run_one() { - local N=$1 - echo "" - echo -e "${BOLD}--- n=${N} ---${NC}" + local n=$1 + local lambda_time="n/a" + local sp1_time="n/a" + local sp1_cycles="n/a" + local ratio="n/a" - local lambda_time="" - local sp1_time="" - local sp1_cycles="" + echo "" + echo -e "${BOLD}--- n=${n} ---${NC}" if $RUN_LAMBDA; then - echo -e " ${GREEN}[Lambda VM] Building (n=${N})...${NC}" - (cd "$LAMBDA_DIR" && BENCH_N="$N" cargo +nightly build --release \ - --target "$TARGET_SPEC" \ - -Z build-std=core -Z build-std-features=compiler-builtins-mem 2>&1 \ - -Z json-target-spec 2>&1 | tail -1) - LAMBDA_ELF="$LAMBDA_DIR/target/riscv64im-lambda-vm-elf/release/fibonacci-bench" + local input_file="$TMP_DIR/lambda_${n}.bin" + local proof_file="$TMP_DIR/lambda_${n}.proof" + local stderr_file="$TMP_DIR/lambda_${n}.stderr" + write_u64_le "$n" "$input_file" echo -e " ${GREEN}[Lambda VM] Proving...${NC}" - LAMBDA_OUTPUT=$("$CLI" prove "$LAMBDA_ELF" -o "$TMP_DIR/lambda_proof.bin" --time 2>"$TMP_DIR/lambda_err.txt") - if [ $? -ne 0 ]; then + local lambda_output + if ! 
lambda_output=$("$CLI" prove "$LAMBDA_ELF" -o "$proof_file" --private-input "$input_file" --time 2>"$stderr_file"); then echo -e " ${RED}[Lambda VM] FAILED:${NC}" - cat "$TMP_DIR/lambda_err.txt" + cat "$stderr_file" + exit 1 fi - lambda_time=$(echo "$LAMBDA_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') + rm -f "$proof_file" + + lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') + if [ -z "$lambda_time" ]; then + echo -e " ${RED}[Lambda VM] FAILED: could not parse proving time${NC}" + printf "%s\n" "$lambda_output" + exit 1 + fi + echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" + LAMBDA_STEPS+=("$n") + LAMBDA_TIMES+=("$lambda_time") + + if [ -n "$REPORT_DIR" ]; then + printf "%s\n" "$lambda_output" > "$REPORT_DIR/raw/lambda_${n}.stdout" + cp "$stderr_file" "$REPORT_DIR/raw/lambda_${n}.stderr" + fi fi if $RUN_SP1; then echo -e " ${GREEN}[SP1 v6] Proving...${NC}" - SP1_OUTPUT=$("$SP1_BIN" "$N" 2>/dev/null) - sp1_time=$(echo "$SP1_OUTPUT" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') - sp1_cycles=$(echo "$SP1_OUTPUT" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*') + local sp1_output_file="$TMP_DIR/sp1_${n}.stdout" + if ! 
"$SP1_BIN" "$n" > "$sp1_output_file" 2>&1; then + echo -e " ${RED}[SP1 v6] FAILED:${NC}" + cat "$sp1_output_file" + exit 1 + fi + + sp1_time=$(grep -o 'Proving time: [0-9.]*s' "$sp1_output_file" | grep -o '[0-9.]*') + sp1_cycles=$(grep -o 'Cycles: [0-9]*' "$sp1_output_file" | grep -o '[0-9]*') + if [ -z "$sp1_time" ] || [ -z "$sp1_cycles" ]; then + echo -e " ${RED}[SP1 v6] FAILED: could not parse output${NC}" + cat "$sp1_output_file" + exit 1 + fi + echo -e " SP1 v6: ${BOLD}${sp1_time}s${NC} (${sp1_cycles} cycles)" + SP1_STEPS+=("$n") + SP1_TIMES+=("$sp1_time") + + if [ -n "$REPORT_DIR" ]; then + cp "$sp1_output_file" "$REPORT_DIR/raw/sp1_${n}.stdout" + fi fi - RESULT_N+=("$N") - RESULT_LAMBDA+=("${lambda_time:-n/a}") - RESULT_SP1+=("${sp1_time:-n/a}") -} + if [ "$lambda_time" != "n/a" ] && [ "$sp1_time" != "n/a" ]; then + ratio=$(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { printf "%.3f", lambda / sp1 }') + fi + + RESULT_N+=("$n") + RESULT_LAMBDA+=("$lambda_time") + RESULT_SP1+=("$sp1_time") + RESULT_SP1_CYCLES+=("$sp1_cycles") + RESULT_RATIO+=("$ratio") -# --- Run series --------------------------------------------------------------- + if [ -n "$REPORT_DIR" ]; then + printf "%s\t%s\t%s\t%s\t%s\n" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" "$ratio" >> "$REPORT_DIR/results.tsv" + fi +} -for N in "${SERIES[@]}"; do - run_one "$N" +for n in "${SERIES[@]}"; do + run_one "$n" done -# --- Summary table ------------------------------------------------------------ +# --- Projection ------------------------------------------------------------- + +LAMBDA_SLOPE="" +LAMBDA_INTERCEPT="" +LAMBDA_R2="" +LAMBDA_PROJECTED_S="" +LAMBDA_PROJECTED_H="" + +SP1_SLOPE="" +SP1_INTERCEPT="" +SP1_R2="" +SP1_PROJECTED_S="" +SP1_PROJECTED_H="" + +compute_projection() { + local label=$1 + local steps_slash=$2 + local times_slash=$3 + local slope intercept r2 projected_s projected_h + + if [ -z "$steps_slash" ] || [ -z "$times_slash" ]; then + return 0 + fi + + read 
-r slope intercept r2 <<< "$(fit_series "$steps_slash" "$times_slash")" + projected_s=$(project_series "$slope" "$intercept" "$TARGET_STEPS") + projected_h=$(format_hours "$projected_s") + + case "$label" in + lambda) + LAMBDA_SLOPE=$slope + LAMBDA_INTERCEPT=$intercept + LAMBDA_R2=$r2 + LAMBDA_PROJECTED_S=$projected_s + LAMBDA_PROJECTED_H=$projected_h + ;; + sp1) + SP1_SLOPE=$slope + SP1_INTERCEPT=$intercept + SP1_R2=$r2 + SP1_PROJECTED_S=$projected_s + SP1_PROJECTED_H=$projected_h + ;; + esac +} + +if $RUN_LAMBDA && [ ${#LAMBDA_STEPS[@]} -gt 0 ]; then + compute_projection "lambda" "$(join_slash "${LAMBDA_STEPS[@]}")" "$(join_slash "${LAMBDA_TIMES[@]}")" +fi +if $RUN_SP1 && [ ${#SP1_STEPS[@]} -gt 0 ]; then + compute_projection "sp1" "$(join_slash "${SP1_STEPS[@]}")" "$(join_slash "${SP1_TIMES[@]}")" +fi + +# --- Summary table ---------------------------------------------------------- echo "" echo -e "${BOLD}=== Summary ===${NC}" echo -e "Program: Fibonacci (u64 wrapping)" echo "" -# Header if $RUN_LAMBDA && $RUN_SP1; then - printf " %-10s %12s %12s %8s\n" "n" "Lambda VM" "SP1 v6" "Ratio" - printf " %-10s %12s %12s %8s\n" "---" "---------" "------" "-----" + printf " %-10s %12s %12s %12s %8s\n" "n" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio" + printf " %-10s %12s %12s %12s %8s\n" "---" "---------" "------" "----------" "-----" elif $RUN_LAMBDA; then printf " %-10s %12s\n" "n" "Lambda VM" printf " %-10s %12s\n" "---" "---------" else - printf " %-10s %12s\n" "n" "SP1 v6" - printf " %-10s %12s\n" "---" "------" + printf " %-10s %12s %12s\n" "n" "SP1 v6" "SP1 cycles" + printf " %-10s %12s %12s\n" "---" "------" "----------" fi for i in "${!RESULT_N[@]}"; do n="${RESULT_N[$i]}" - lt="${RESULT_LAMBDA[$i]}" - st="${RESULT_SP1[$i]}" + lambda_time="${RESULT_LAMBDA[$i]}" + sp1_time="${RESULT_SP1[$i]}" + sp1_cycles="${RESULT_SP1_CYCLES[$i]}" + ratio="${RESULT_RATIO[$i]}" if $RUN_LAMBDA && $RUN_SP1; then - if [ "$lt" != "n/a" ] && [ "$st" != "n/a" ]; then - RATIO=$(LC_NUMERIC=C 
awk "BEGIN {printf \"%.1fx\", $lt / $st}") - if (( $(LC_NUMERIC=C awk "BEGIN {print ($lt > $st)}") )); then - RATIO="${RED}${RATIO}${NC}" + if [ "$ratio" != "n/a" ]; then + ratio_colored=$(LC_NUMERIC=C awk -v ratio="$ratio" 'BEGIN { printf "%.1fx", ratio }') + if (( $(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { print (lambda > sp1) }') )); then + ratio_colored="${RED}${ratio_colored}${NC}" else - RATIO="${GREEN}${RATIO}${NC}" + ratio_colored="${GREEN}${ratio_colored}${NC}" fi - printf " %-10s %11ss %11ss " "$n" "$lt" "$st" - echo -e "$RATIO" + printf " %-10s %11ss %11ss %12s " "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" + echo -e "$ratio_colored" else - printf " %-10s %12s %12s %8s\n" "$n" "${lt}s" "${st}s" "-" + printf " %-10s %12s %12s %12s %8s\n" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-" fi elif $RUN_LAMBDA; then - printf " %-10s %11ss\n" "$n" "$lt" + printf " %-10s %11ss\n" "$n" "$lambda_time" else - printf " %-10s %11ss\n" "$n" "$st" + printf " %-10s %11ss %12s\n" "$n" "$sp1_time" "$sp1_cycles" fi done echo "" -echo -e "Green ratio = Lambda VM faster, Red = SP1 faster" +if $RUN_LAMBDA && $RUN_SP1; then + echo -e "Green ratio = Lambda VM faster, Red = SP1 faster" +fi echo "Raw data in $TMP_DIR/" + +if [ -n "$LAMBDA_PROJECTED_S" ] || [ -n "$SP1_PROJECTED_S" ]; then + echo "" + echo -e "${BOLD}=== Linear Projection to 500M Steps ===${NC}" + if [ -n "$LAMBDA_PROJECTED_S" ]; then + echo " Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R²=${LAMBDA_R2}" + fi + if [ -n "$SP1_PROJECTED_S" ]; then + echo " SP1 v6: ${SP1_PROJECTED_S}s (${SP1_PROJECTED_H}h), R²=${SP1_R2}" + fi +fi + +# --- Machine-readable report ------------------------------------------------ + +if [ -n "$REPORT_DIR" ]; then + { + echo "target_steps=$TARGET_STEPS" + echo "series=$(join_slash "${RESULT_N[@]}")" + echo "lambda_times=$(join_slash "${RESULT_LAMBDA[@]}")" + echo "sp1_times=$(join_slash "${RESULT_SP1[@]}")" + echo "sp1_cycles=$(join_slash 
"${RESULT_SP1_CYCLES[@]}")" + echo "ratios=$(join_slash "${RESULT_RATIO[@]}")" + if [ -n "$LAMBDA_PROJECTED_S" ]; then + echo "lambda_slope_s_per_1m=$LAMBDA_SLOPE" + echo "lambda_intercept_s=$LAMBDA_INTERCEPT" + echo "lambda_r2=$LAMBDA_R2" + echo "lambda_projected_time_s=$LAMBDA_PROJECTED_S" + echo "lambda_projected_time_h=$LAMBDA_PROJECTED_H" + fi + if [ -n "$SP1_PROJECTED_S" ]; then + echo "sp1_slope_s_per_1m=$SP1_SLOPE" + echo "sp1_intercept_s=$SP1_INTERCEPT" + echo "sp1_r2=$SP1_R2" + echo "sp1_projected_time_s=$SP1_PROJECTED_S" + echo "sp1_projected_time_h=$SP1_PROJECTED_H" + fi + } > "$REPORT_DIR/metrics.txt" + + { + echo "# Lambda VM vs SP1 v6 Benchmark" + echo + echo "| n | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |" + echo "|--:|--------------:|-----------:|-----------:|------:|" + for i in "${!RESULT_N[@]}"; do + printf "| %s | %s | %s | %s | %s |\n" \ + "${RESULT_N[$i]}" \ + "${RESULT_LAMBDA[$i]}" \ + "${RESULT_SP1[$i]}" \ + "${RESULT_SP1_CYCLES[$i]}" \ + "${RESULT_RATIO[$i]}" + done + echo + echo "## Linear Projection to 500M Steps" + echo + echo "| Prover | Slope (s / 1M steps) | Intercept (s) | R² | Projected @ 500M (s) | Projected @ 500M (h) |" + echo "|--------|----------------------:|--------------:|---:|---------------------:|---------------------:|" + if [ -n "$LAMBDA_PROJECTED_S" ]; then + printf "| Lambda VM | %s | %s | %s | %s | %s |\n" \ + "$LAMBDA_SLOPE" \ + "$LAMBDA_INTERCEPT" \ + "$LAMBDA_R2" \ + "$LAMBDA_PROJECTED_S" \ + "$LAMBDA_PROJECTED_H" + fi + if [ -n "$SP1_PROJECTED_S" ]; then + printf "| SP1 v6 | %s | %s | %s | %s | %s |\n" \ + "$SP1_SLOPE" \ + "$SP1_INTERCEPT" \ + "$SP1_R2" \ + "$SP1_PROJECTED_S" \ + "$SP1_PROJECTED_H" + fi + } > "$REPORT_DIR/summary.md" +fi diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs index 725f0de5f..3a1917a32 100644 --- a/bin/cli/src/main.rs +++ b/bin/cli/src/main.rs @@ -101,6 +101,10 @@ enum Commands { #[arg(value_parser, value_hint = ValueHint::FilePath)] elf: PathBuf, + /// Path to the 
private input file + #[arg(long, value_hint = ValueHint::FilePath)] + private_input: Option, + /// Generate flamegraph folded stacks to file #[arg(long, value_hint = ValueHint::FilePath)] flamegraph: Option, @@ -116,6 +120,10 @@ enum Commands { #[arg(short, long, value_hint = ValueHint::FilePath)] output: PathBuf, + /// Path to the private input file + #[arg(long, value_hint = ValueHint::FilePath)] + private_input: Option, + /// Blowup factor (power of 2). Higher = fewer queries, smaller proof, slower proving. #[arg(long)] blowup: Option, @@ -149,13 +157,18 @@ fn main() -> ExitCode { let cli = Cli::parse(); match cli.command { - Commands::Execute { elf, flamegraph } => cmd_execute(elf, flamegraph), + Commands::Execute { + elf, + private_input, + flamegraph, + } => cmd_execute(elf, private_input, flamegraph), Commands::Prove { elf, output, + private_input, blowup, time, - } => cmd_prove(elf, output, blowup, time), + } => cmd_prove(elf, output, private_input, blowup, time), Commands::Verify { proof, elf, @@ -165,7 +178,21 @@ fn main() -> ExitCode { } } -fn cmd_execute(elf_path: PathBuf, flamegraph_path: Option) -> ExitCode { +fn read_private_input(path: Option<&PathBuf>) -> Result, String> { + match path { + Some(path) => { + eprintln!("Reading private input file..."); + std::fs::read(path).map_err(|e| format!("Failed to read private input file: {e}")) + } + None => Ok(vec![]), + } +} + +fn cmd_execute( + elf_path: PathBuf, + private_input_path: Option, + flamegraph_path: Option, +) -> ExitCode { let elf_data = match std::fs::read(&elf_path) { Ok(data) => data, Err(e) => { @@ -182,7 +209,15 @@ fn cmd_execute(elf_path: PathBuf, flamegraph_path: Option) -> ExitCode } }; - let mut executor = match Executor::new(&program, vec![]) { + let private_inputs = match read_private_input(private_input_path.as_ref()) { + Ok(inputs) => inputs, + Err(e) => { + eprintln!("{e}"); + return ExitCode::FAILURE; + } + }; + + let mut executor = match Executor::new(&program, private_inputs) 
{ Ok(e) => e, Err(e) => { eprintln!("Failed to create executor: {:?}", e); @@ -249,7 +284,13 @@ fn cmd_execute(elf_path: PathBuf, flamegraph_path: Option) -> ExitCode ExitCode::SUCCESS } -fn cmd_prove(elf_path: PathBuf, output_path: PathBuf, blowup: Option, time: bool) -> ExitCode { +fn cmd_prove( + elf_path: PathBuf, + output_path: PathBuf, + private_input_path: Option, + blowup: Option, + time: bool, +) -> ExitCode { eprintln!("Reading ELF file..."); let elf_data = match std::fs::read(&elf_path) { Ok(data) => data, @@ -259,6 +300,14 @@ fn cmd_prove(elf_path: PathBuf, output_path: PathBuf, blowup: Option, time: } }; + let private_inputs = match read_private_input(private_input_path.as_ref()) { + Ok(inputs) => inputs, + Err(e) => { + eprintln!("{e}"); + return ExitCode::FAILURE; + } + }; + #[cfg(feature = "jemalloc-stats")] let tracker = heap_tracker::HeapTracker::start(); @@ -276,11 +325,16 @@ fn cmd_prove(elf_path: PathBuf, output_path: PathBuf, blowup: Option, time: "Generating proof (blowup={b}, queries={})...", opts.fri_number_of_queries ); - prover::prove_with_options(&elf_data, &opts, &Default::default()) + prover::prove_with_options_and_inputs( + &elf_data, + &private_inputs, + &opts, + &Default::default(), + ) } None => { eprintln!("Generating proof..."); - prover::prove(&elf_data) + prover::prove_with_inputs(&elf_data, &private_inputs) } }; let prove_elapsed = start.elapsed(); diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index 6cfab6ea3..41089a9e5 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -1,8 +1,6 @@ #[cfg(feature = "debug-checks")] pub mod bus_debug; pub mod constraints; -#[cfg(feature = "instruments")] -pub mod instruments; pub mod context; pub mod debug; pub mod domain; diff --git a/prover/src/lib.rs b/prover/src/lib.rs index 21bfc9255..a4eb6cd7a 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -461,8 +461,14 @@ pub(crate) fn compute_expected_commit_bus_balance( /// Prove an ELF binary execution. 
Returns a serializable proof bundle. pub fn prove(elf_bytes: &[u8]) -> Result { - prove_with_options( + prove_with_inputs(elf_bytes, &[]) +} + +/// Prove an ELF binary execution with private inputs. Returns a serializable proof bundle. +pub fn prove_with_inputs(elf_bytes: &[u8], private_inputs: &[u8]) -> Result { + prove_with_options_and_inputs( elf_bytes, + private_inputs, &GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 is always valid"), &MaxRowsConfig::default(), ) @@ -473,6 +479,17 @@ pub fn prove_with_options( elf_bytes: &[u8], proof_options: &ProofOptions, max_rows: &MaxRowsConfig, +) -> Result { + prove_with_options_and_inputs(elf_bytes, &[], proof_options, max_rows) +} + +/// Prove an ELF binary execution with custom proof options, max rows config, +/// and explicit private inputs. +pub fn prove_with_options_and_inputs( + elf_bytes: &[u8], + private_inputs: &[u8], + proof_options: &ProofOptions, + max_rows: &MaxRowsConfig, ) -> Result { #[cfg(feature = "instruments")] let total_start = std::time::Instant::now(); @@ -482,7 +499,8 @@ pub fn prove_with_options( let phase_start = std::time::Instant::now(); let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?; - let executor = Executor::new(&program, vec![]).map_err(|e| Error::Execution(format!("{e}")))?; + let executor = Executor::new(&program, private_inputs.to_vec()) + .map_err(|e| Error::Execution(format!("{e}")))?; let result = executor .run() .map_err(|e| Error::Execution(format!("{e}")))?; From 0315a80dbd6c5678d761b79cf11485dbc2c0e4e0 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 6 Apr 2026 11:36:46 -0300 Subject: [PATCH 11/34] save work --- .github/workflows/bench-vs-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml index 50e88bb63..eb063c5ae 100644 --- a/.github/workflows/bench-vs-nightly.yml +++ b/.github/workflows/bench-vs-nightly.yml @@ -43,7 
+43,7 @@ jobs: - name: Run nightly benchmark run: | bash ./bench_vs/run.sh \ - -n 1000000 2000000 4000000 8000000 \ + -n 500000 1000000 1500000 2000000 \ --report-dir bench_vs_artifacts \ --no-color From 3f3c7aca2634f9bd85b2dc397d08a6b03ab672ef Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 7 Apr 2026 15:47:57 -0300 Subject: [PATCH 12/34] Fix bench_vs 5x projection inflation and add --steps flag for nightly 1M/2M/4M/8M benchmarks --- .github/workflows/bench-vs-nightly.yml | 4 +- bench_vs/README.md | 18 ++- bench_vs/run.sh | 210 +++++++++++++++++++------ 3 files changed, 175 insertions(+), 57 deletions(-) diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml index eb063c5ae..68f8d5fac 100644 --- a/.github/workflows/bench-vs-nightly.yml +++ b/.github/workflows/bench-vs-nightly.yml @@ -43,14 +43,14 @@ jobs: - name: Run nightly benchmark run: | bash ./bench_vs/run.sh \ - -n 500000 1000000 1500000 2000000 \ + --steps 1000000 2000000 4000000 8000000 \ --report-dir bench_vs_artifacts \ --no-color - name: Upload nightly benchmark artifact uses: actions/upload-artifact@v4 with: - name: bench-vs-nightly-${{ github.sha }} + name: bench-vs-nightly-${{ github.run_number }}-${{ github.sha }} path: bench_vs_artifacts retention-days: 90 diff --git a/bench_vs/README.md b/bench_vs/README.md index 1be30c5d2..1e8a8d9f3 100644 --- a/bench_vs/README.md +++ b/bench_vs/README.md @@ -29,6 +29,9 @@ Compares proving time for an identical u64 wrapping Fibonacci computation. # Custom series ./bench_vs/run.sh -n 1000 50000 +# Approximate workload steps (converted with 5 steps/iteration) +./bench_vs/run.sh --steps 1000000 2000000 4000000 8000000 + # Run only one prover ./bench_vs/run.sh --lambda-only ./bench_vs/run.sh --sp1-only @@ -42,18 +45,21 @@ Only **proving time** is compared (wall-clock, no recursion/compression on eithe - **Lambda VM**: Generates RISC-V assembly at runtime, assembles to ELF, proves via the CLI. 
- **SP1 v6**: Compiles a Rust guest program to RISC-V, proves via `sp1-sdk` core mode. +The linear projection uses a common axis for both provers: target workload steps. +When you pass `--steps`, that target is explicit. When you pass `-n`, the script +approximates workload as `steps ~= 5 * n`. `SP1 cycles` are still reported, but +only as telemetry and not as the regression axis. + ## Output ``` === Summary === Program: Fibonacci (u64 wrapping) - n Lambda VM SP1 v6 Ratio - --- --------- ------ ----- - 1000 13.3s 12.4s 0.9x - 10000 22.4s 12.9s 0.6x - 100000 116.4s 14.7s 0.1x - 300000 ... ... ... + Target steps Iterations Lambda VM SP1 v6 SP1 cycles Ratio + ------------ ---------- --------- ------ ---------- ----- + 1000000 200000 ...s ...s 1004794 ... + 2000000 400000 ...s ...s 2004794 ... Green ratio = Lambda VM faster, Red = SP1 faster ``` diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 5592e095c..7e3b06c23 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -1,10 +1,13 @@ #!/bin/bash # Benchmark: Lambda VM vs SP1 v6 — Fibonacci proving time comparison. # -# Usage: ./bench_vs/run.sh [-n 1000 50000 100000] [--lambda-only | --sp1-only] -# [--report-dir DIR] [--no-color] +# Usage: ./bench_vs/run.sh [-n 1000 50000 100000 | --steps 1000000 2000000] +# [--lambda-only | --sp1-only] [--report-dir DIR] +# [--target-steps N] [--no-color] # -# Without -n, runs the default series: 1000 10000 100000 300000 +# Without an explicit series, defaults to: +# - iterations mode: 1000 10000 100000 300000 +# - steps mode: 1000000 2000000 4000000 8000000 # # Prerequisites: # - Lambda VM CLI build dependencies available @@ -18,7 +21,8 @@ ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" TMP_DIR="/tmp/bench_fib" REPORT_DIR="" NO_COLOR=false -TARGET_STEPS=500000000 +TARGET_STEPS="${TARGET_STEPS:-500000000}" +APPROX_STEPS_PER_ITERATION=5 RED='\033[0;31m' GREEN='\033[0;32m' @@ -27,8 +31,10 @@ BOLD='\033[1m' NC='\033[0m' # --- Defaults --------------------------------------------------------------- -DEFAULT_SERIES=(1000 10000 100000 300000) +DEFAULT_ITERATION_SERIES=(1000 10000 100000 300000) +DEFAULT_STEP_SERIES=(1000000 2000000 4000000 8000000) SERIES=() +SERIES_MODE="" RUN_LAMBDA=true RUN_SP1=true @@ -36,6 +42,23 @@ RUN_SP1=true while [[ $# -gt 0 ]]; do case $1 in -n) + if [ -n "$SERIES_MODE" ] && [ "$SERIES_MODE" != "iterations" ]; then + echo "Cannot mix -n with --steps" + exit 1 + fi + SERIES_MODE="iterations" + shift + while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do + SERIES+=("$1") + shift + done + ;; + --steps) + if [ -n "$SERIES_MODE" ] && [ "$SERIES_MODE" != "steps" ]; then + echo "Cannot mix --steps with -n" + exit 1 + fi + SERIES_MODE="steps" shift while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do SERIES+=("$1") @@ -51,21 +74,30 @@ while [[ $# -gt 0 ]]; do shift ;; --report-dir) + if [[ $# -lt 2 ]]; then echo "--report-dir requires an argument"; exit 1; fi REPORT_DIR=$2 shift 2 ;; + --target-steps) + if [[ $# -lt 2 ]]; then echo "--target-steps requires an argument"; exit 1; fi + TARGET_STEPS=$2 + shift 2 + ;; --no-color) NO_COLOR=true shift ;; -h|--help) - echo "Usage: $0 [-n N1 N2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--no-color]" + echo "Usage: $0 [-n N1 N2 ... | --steps S1 S2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--target-steps N] [--no-color]" echo "" echo " -n N1 N2 ... Fibonacci iteration counts (space-separated)" - echo " Default series: ${DEFAULT_SERIES[*]}" + echo " Default iteration series: ${DEFAULT_ITERATION_SERIES[*]}" + echo " --steps S1 S2 ... 
Approximate workload steps; converted via ${APPROX_STEPS_PER_ITERATION} steps/iteration" + echo " Default step series: ${DEFAULT_STEP_SERIES[*]}" echo " --lambda-only Only run Lambda VM benchmark" echo " --sp1-only Only run SP1 benchmark" echo " --report-dir DIR Write TSV, metrics, markdown summary, and raw outputs" + echo " --target-steps N Projection target in workload steps (default: $TARGET_STEPS)" echo " --no-color Disable ANSI colors" exit 0 ;; @@ -73,11 +105,19 @@ while [[ $# -gt 0 ]]; do echo "Unknown option: $1" exit 1 ;; - esac + esac done +if [ -z "$SERIES_MODE" ]; then + SERIES_MODE="iterations" +fi + if [ ${#SERIES[@]} -eq 0 ]; then - SERIES=("${DEFAULT_SERIES[@]}") + if [ "$SERIES_MODE" = "steps" ]; then + SERIES=("${DEFAULT_STEP_SERIES[@]}") + else + SERIES=("${DEFAULT_ITERATION_SERIES[@]}") + fi fi if ! $RUN_LAMBDA && ! $RUN_SP1; then @@ -109,6 +149,20 @@ join_slash() { printf "%s\n" "$joined" } +approx_steps_for_iterations() { + local iterations=$1 + awk -v iterations="$iterations" -v ratio="$APPROX_STEPS_PER_ITERATION" 'BEGIN { + printf "%.0f\n", iterations * ratio + }' +} + +approx_iterations_for_steps() { + local steps=$1 + awk -v steps="$steps" -v ratio="$APPROX_STEPS_PER_ITERATION" 'BEGIN { + printf "%.0f\n", steps / ratio + }' +} + fit_series() { local steps_slash=$1 local values_slash=$2 @@ -197,7 +251,9 @@ PY } echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}" -echo -e "Series: ${YELLOW}${SERIES[*]}${NC}" +echo -e "Series mode: ${YELLOW}${SERIES_MODE}${NC}" +echo -e "Requested series: ${YELLOW}${SERIES[*]}${NC}" +echo -e "Projection target: ${YELLOW}${TARGET_STEPS}${NC} workload steps" echo "" # --- Pre-build -------------------------------------------------------------- @@ -242,30 +298,60 @@ fi # --- Run benchmark series --------------------------------------------------- -RESULT_N=() +RUN_ITERATIONS=() +RUN_TARGET_STEPS=() +for value in "${SERIES[@]}"; do + if [ "$SERIES_MODE" = "steps" ]; then + target_steps=$value + 
iterations=$(approx_iterations_for_steps "$target_steps") + else + iterations=$value + target_steps=$(approx_steps_for_iterations "$iterations") + fi + + if [ "$iterations" -le 0 ]; then + echo "Invalid series value: $value" + exit 1 + fi + + RUN_ITERATIONS+=("$iterations") + RUN_TARGET_STEPS+=("$target_steps") +done + +if [ "$SERIES_MODE" = "steps" ]; then + echo -e "Iterations used: ${YELLOW}${RUN_ITERATIONS[*]}${NC}" + echo "" +fi + +RESULT_TARGET_STEPS=() +RESULT_ITERATIONS=() +RESULT_PROJECTION_STEPS=() RESULT_LAMBDA=() RESULT_SP1=() RESULT_SP1_CYCLES=() RESULT_RATIO=() -LAMBDA_STEPS=() +LAMBDA_PROJECTION_STEPS=() LAMBDA_TIMES=() -SP1_STEPS=() +SP1_PROJECTION_STEPS=() SP1_TIMES=() +PROJECTION_AXIS="target_workload_steps" if [ -n "$REPORT_DIR" ]; then - printf "n\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv" + printf "target_steps\titerations\tprojection_steps\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv" fi run_one() { local n=$1 + local target_steps=$2 local lambda_time="n/a" local sp1_time="n/a" local sp1_cycles="n/a" + local projection_steps=$target_steps local ratio="n/a" echo "" - echo -e "${BOLD}--- n=${n} ---${NC}" + echo -e "${BOLD}--- target≈${target_steps} steps (n=${n} iterations) ---${NC}" if $RUN_LAMBDA; then local input_file="$TMP_DIR/lambda_${n}.bin" @@ -290,8 +376,6 @@ run_one() { fi echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" - LAMBDA_STEPS+=("$n") - LAMBDA_TIMES+=("$lambda_time") if [ -n "$REPORT_DIR" ]; then printf "%s\n" "$lambda_output" > "$REPORT_DIR/raw/lambda_${n}.stdout" @@ -317,8 +401,6 @@ run_one() { fi echo -e " SP1 v6: ${BOLD}${sp1_time}s${NC} (${sp1_cycles} cycles)" - SP1_STEPS+=("$n") - SP1_TIMES+=("$sp1_time") if [ -n "$REPORT_DIR" ]; then cp "$sp1_output_file" "$REPORT_DIR/raw/sp1_${n}.stdout" @@ -329,19 +411,37 @@ run_one() { ratio=$(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { printf "%.3f", lambda / 
sp1 }') fi - RESULT_N+=("$n") + if [ "$lambda_time" != "n/a" ]; then + LAMBDA_PROJECTION_STEPS+=("$target_steps") + LAMBDA_TIMES+=("$lambda_time") + fi + if [ "$sp1_time" != "n/a" ]; then + SP1_PROJECTION_STEPS+=("$target_steps") + SP1_TIMES+=("$sp1_time") + fi + + RESULT_TARGET_STEPS+=("$target_steps") + RESULT_ITERATIONS+=("$n") + RESULT_PROJECTION_STEPS+=("$projection_steps") RESULT_LAMBDA+=("$lambda_time") RESULT_SP1+=("$sp1_time") RESULT_SP1_CYCLES+=("$sp1_cycles") RESULT_RATIO+=("$ratio") if [ -n "$REPORT_DIR" ]; then - printf "%s\t%s\t%s\t%s\t%s\n" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" "$ratio" >> "$REPORT_DIR/results.tsv" + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ + "$target_steps" \ + "$n" \ + "$projection_steps" \ + "$lambda_time" \ + "$sp1_time" \ + "$sp1_cycles" \ + "$ratio" >> "$REPORT_DIR/results.tsv" fi } -for n in "${SERIES[@]}"; do - run_one "$n" +for i in "${!RUN_ITERATIONS[@]}"; do + run_one "${RUN_ITERATIONS[$i]}" "${RUN_TARGET_STEPS[$i]}" done # --- Projection ------------------------------------------------------------- @@ -390,11 +490,11 @@ compute_projection() { esac } -if $RUN_LAMBDA && [ ${#LAMBDA_STEPS[@]} -gt 0 ]; then - compute_projection "lambda" "$(join_slash "${LAMBDA_STEPS[@]}")" "$(join_slash "${LAMBDA_TIMES[@]}")" +if $RUN_LAMBDA && [ ${#LAMBDA_PROJECTION_STEPS[@]} -gt 0 ]; then + compute_projection "lambda" "$(join_slash "${LAMBDA_PROJECTION_STEPS[@]}")" "$(join_slash "${LAMBDA_TIMES[@]}")" fi -if $RUN_SP1 && [ ${#SP1_STEPS[@]} -gt 0 ]; then - compute_projection "sp1" "$(join_slash "${SP1_STEPS[@]}")" "$(join_slash "${SP1_TIMES[@]}")" +if $RUN_SP1 && [ ${#SP1_PROJECTION_STEPS[@]} -gt 0 ]; then + compute_projection "sp1" "$(join_slash "${SP1_PROJECTION_STEPS[@]}")" "$(join_slash "${SP1_TIMES[@]}")" fi # --- Summary table ---------------------------------------------------------- @@ -405,18 +505,19 @@ echo -e "Program: Fibonacci (u64 wrapping)" echo "" if $RUN_LAMBDA && $RUN_SP1; then - printf " %-10s %12s %12s %12s %8s\n" 
"n" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio" - printf " %-10s %12s %12s %12s %8s\n" "---" "---------" "------" "----------" "-----" + printf " %-12s %-12s %12s %12s %12s %8s\n" "Target steps" "Iterations" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio" + printf " %-12s %-12s %12s %12s %12s %8s\n" "------------" "----------" "---------" "------" "----------" "-----" elif $RUN_LAMBDA; then - printf " %-10s %12s\n" "n" "Lambda VM" - printf " %-10s %12s\n" "---" "---------" + printf " %-12s %-12s %12s\n" "Target steps" "Iterations" "Lambda VM" + printf " %-12s %-12s %12s\n" "------------" "----------" "---------" else - printf " %-10s %12s %12s\n" "n" "SP1 v6" "SP1 cycles" - printf " %-10s %12s %12s\n" "---" "------" "----------" + printf " %-12s %-12s %12s %12s\n" "Target steps" "Iterations" "SP1 v6" "SP1 cycles" + printf " %-12s %-12s %12s %12s\n" "------------" "----------" "------" "----------" fi -for i in "${!RESULT_N[@]}"; do - n="${RESULT_N[$i]}" +for i in "${!RESULT_ITERATIONS[@]}"; do + target_steps="${RESULT_TARGET_STEPS[$i]}" + n="${RESULT_ITERATIONS[$i]}" lambda_time="${RESULT_LAMBDA[$i]}" sp1_time="${RESULT_SP1[$i]}" sp1_cycles="${RESULT_SP1_CYCLES[$i]}" @@ -430,15 +531,15 @@ for i in "${!RESULT_N[@]}"; do else ratio_colored="${GREEN}${ratio_colored}${NC}" fi - printf " %-10s %11ss %11ss %12s " "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" + printf " %-12s %-12s %11ss %11ss %12s " "$target_steps" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" echo -e "$ratio_colored" else - printf " %-10s %12s %12s %12s %8s\n" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-" + printf " %-12s %-12s %12s %12s %12s %8s\n" "$target_steps" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-" fi elif $RUN_LAMBDA; then - printf " %-10s %11ss\n" "$n" "$lambda_time" + printf " %-12s %-12s %11ss\n" "$target_steps" "$n" "$lambda_time" else - printf " %-10s %11ss %12s\n" "$n" "$sp1_time" "$sp1_cycles" + printf " %-12s %-12s %11ss %12s\n" "$target_steps" "$n" "$sp1_time" 
"$sp1_cycles" fi done @@ -450,7 +551,9 @@ echo "Raw data in $TMP_DIR/" if [ -n "$LAMBDA_PROJECTED_S" ] || [ -n "$SP1_PROJECTED_S" ]; then echo "" - echo -e "${BOLD}=== Linear Projection to 500M Steps ===${NC}" + echo -e "${BOLD}=== Linear Projection to ${TARGET_STEPS} Workload Steps ===${NC}" + echo " Axis: target workload steps" + echo " Note: when using iterations input, target steps are approximated as ${APPROX_STEPS_PER_ITERATION} * n" if [ -n "$LAMBDA_PROJECTED_S" ]; then echo " Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R²=${LAMBDA_R2}" fi @@ -463,8 +566,13 @@ fi if [ -n "$REPORT_DIR" ]; then { + echo "series_mode=$SERIES_MODE" + echo "requested_series=$(join_slash "${SERIES[@]}")" + echo "target_steps_series=$(join_slash "${RESULT_TARGET_STEPS[@]}")" + echo "iterations=$(join_slash "${RESULT_ITERATIONS[@]}")" + echo "projection_axis=$PROJECTION_AXIS" echo "target_steps=$TARGET_STEPS" - echo "series=$(join_slash "${RESULT_N[@]}")" + echo "projection_steps=$(join_slash "${RESULT_PROJECTION_STEPS[@]}")" echo "lambda_times=$(join_slash "${RESULT_LAMBDA[@]}")" echo "sp1_times=$(join_slash "${RESULT_SP1[@]}")" echo "sp1_cycles=$(join_slash "${RESULT_SP1_CYCLES[@]}")" @@ -488,21 +596,25 @@ if [ -n "$REPORT_DIR" ]; then { echo "# Lambda VM vs SP1 v6 Benchmark" echo - echo "| n | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |" - echo "|--:|--------------:|-----------:|-----------:|------:|" - for i in "${!RESULT_N[@]}"; do - printf "| %s | %s | %s | %s | %s |\n" \ - "${RESULT_N[$i]}" \ + echo "Projection axis: \`$PROJECTION_AXIS\`" + echo + echo "| Target steps | Iterations | Projection steps | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |" + echo "|-------------:|-----------:|-----------------:|--------------:|-----------:|-----------:|------:|" + for i in "${!RESULT_ITERATIONS[@]}"; do + printf "| %s | %s | %s | %s | %s | %s | %s |\n" \ + "${RESULT_TARGET_STEPS[$i]}" \ + "${RESULT_ITERATIONS[$i]}" \ + "${RESULT_PROJECTION_STEPS[$i]}" \ 
"${RESULT_LAMBDA[$i]}" \ "${RESULT_SP1[$i]}" \ "${RESULT_SP1_CYCLES[$i]}" \ "${RESULT_RATIO[$i]}" done echo - echo "## Linear Projection to 500M Steps" + echo "## Linear Projection to ${TARGET_STEPS} Workload Steps" echo - echo "| Prover | Slope (s / 1M steps) | Intercept (s) | R² | Projected @ 500M (s) | Projected @ 500M (h) |" - echo "|--------|----------------------:|--------------:|---:|---------------------:|---------------------:|" + echo "| Prover | Slope (s / 1M workload steps) | Intercept (s) | R² | Projected @ ${TARGET_STEPS} (s) | Projected @ ${TARGET_STEPS} (h) |" + echo "|--------|-------------------------------:|--------------:|---:|------------------------------:|------------------------------:|" if [ -n "$LAMBDA_PROJECTED_S" ]; then printf "| Lambda VM | %s | %s | %s | %s | %s |\n" \ "$LAMBDA_SLOPE" \ From e9e9fbdebc2666fd44ed9854d27f239fcf973f5a Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 14 Apr 2026 09:54:08 -0300 Subject: [PATCH 13/34] Switch bench_vs projection to measured cycles and add --cycles CLI flag --- bench_vs/README.md | 59 +++++++--- bench_vs/run.sh | 129 ++++++++++++++-------- bench_vs/sp1/fibonacci/script/src/main.rs | 14 ++- bin/cli/src/main.rs | 42 ++++++- 4 files changed, 179 insertions(+), 65 deletions(-) diff --git a/bench_vs/README.md b/bench_vs/README.md index 1e8a8d9f3..c38daa601 100644 --- a/bench_vs/README.md +++ b/bench_vs/README.md @@ -32,23 +32,49 @@ Compares proving time for an identical u64 wrapping Fibonacci computation. # Approximate workload steps (converted with 5 steps/iteration) ./bench_vs/run.sh --steps 1000000 2000000 4000000 8000000 +# Project to a target cycle count +./bench_vs/run.sh --target-cycles 500000000 + # Run only one prover ./bench_vs/run.sh --lambda-only ./bench_vs/run.sh --sp1-only ``` -## What it measures +## What is measured Both provers execute the same program: iterative Fibonacci with `u64::wrapping_add`. 
-Only **proving time** is compared (wall-clock, no recursion/compression on either side). - -- **Lambda VM**: Generates RISC-V assembly at runtime, assembles to ELF, proves via the CLI. -- **SP1 v6**: Compiles a Rust guest program to RISC-V, proves via `sp1-sdk` core mode. -The linear projection uses a common axis for both provers: target workload steps. -When you pass `--steps`, that target is explicit. When you pass `-n`, the script -approximates workload as `steps ~= 5 * n`. `SP1 cycles` are still reported, but -only as telemetry and not as the regression axis. +The timing window on both sides is **end-to-end single-shot proving, with no +verification and no recursion/compression**. Concretely: + +| Phase | Lambda VM timer | SP1 v6 timer | +|--------------------------------------------|:---------------:|:------------:| +| Read ELF + input from disk | ❌ | ❌ | +| Pre-pass execution to count cycles | ❌ | ❌ | +| `setup` / verifying-key derivation | N/A (none) | ✅ | +| ELF parse + guest execution (inside prove) | ✅ | ✅ | +| Trace build | ✅ | ✅ | +| AIR construction | ✅ | ✅ | +| STARK prove (`core` mode) | ✅ | ✅ | +| Proof serialization / write | ❌ | ❌ | +| Verify | ❌ | ❌ | + +Both sides run one extra execution pass **outside** the timer to report dynamic +instruction counts (SP1's `execute(...)` / Lambda's executor pre-pass). This +costs wall-clock time in the CI job but does not inflate the measured proving +time, and the cost is symmetric between the two provers. + +Lambda VM uses the default proof options from `prover::prove_with_inputs` +(`GoldilocksCubicProofOptions::with_blowup(2)`, 50 FRI queries). SP1 v6 uses +the `core` proof mode exposed by `sp1-sdk::ProverClient::from_env()`. + +## Projection axis + +The linear projection uses **measured cycles** per prover — Lambda's executor +log count and SP1's `report.total_instruction_count()`. For Fibonacci the two +values agree to within ~1% (both compile to the same inner loop shape on +RISC-V). 
When cycle data is missing, the script falls back to the approximate +`target_workload_steps ~= 5 * n` label that was passed on the command line. ## Output @@ -56,10 +82,17 @@ only as telemetry and not as the regression axis. === Summary === Program: Fibonacci (u64 wrapping) - Target steps Iterations Lambda VM SP1 v6 SP1 cycles Ratio - ------------ ---------- --------- ------ ---------- ----- - 1000000 200000 ...s ...s 1004794 ... - 2000000 400000 ...s ...s 2004794 ... + Target steps Iterations Lambda (s) Lambda cycles SP1 (s) SP1 cycles Ratio + ------------ ---------- ---------- ------------- ------- ---------- ----- + 1000000 200000 ...s 1004794 ...s 1004794 ... + 2000000 400000 ...s 2004794 ...s 2004794 ... +Timing window covers single-shot end-to-end proving; SP1 includes setup; both exclude verification. Green ratio = Lambda VM faster, Red = SP1 faster ``` + +With `--report-dir DIR` the script writes: +- `results.tsv` — raw per-run data (`target_steps`, `iterations`, `lambda_time_s`, `lambda_axis_value`, `lambda_cycles`, `sp1_time_s`, `sp1_axis_value`, `sp1_cycles`, `ratio`). +- `metrics.txt` — key=value pairs including `timing_window=setup_plus_end_to_end_prove_no_verify`. +- `summary.md` — the same table plus linear projection to `TARGET_CYCLES` cycles. +- `raw/` — stdout/stderr of every individual run. diff --git a/bench_vs/run.sh b/bench_vs/run.sh index 7e3b06c23..f72f3731e 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -3,7 +3,7 @@ # # Usage: ./bench_vs/run.sh [-n 1000 50000 100000 | --steps 1000000 2000000] # [--lambda-only | --sp1-only] [--report-dir DIR] -# [--target-steps N] [--no-color] +# [--target-cycles N] [--no-color] # # Without an explicit series, defaults to: # - iterations mode: 1000 10000 100000 300000 @@ -21,7 +21,7 @@ ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" TMP_DIR="/tmp/bench_fib" REPORT_DIR="" NO_COLOR=false -TARGET_STEPS="${TARGET_STEPS:-500000000}" +TARGET_CYCLES="${TARGET_CYCLES:-${TARGET_STEPS:-500000000}}" APPROX_STEPS_PER_ITERATION=5 RED='\033[0;31m' @@ -78,9 +78,15 @@ while [[ $# -gt 0 ]]; do REPORT_DIR=$2 shift 2 ;; + --target-cycles) + if [[ $# -lt 2 ]]; then echo "--target-cycles requires an argument"; exit 1; fi + TARGET_CYCLES=$2 + shift 2 + ;; --target-steps) if [[ $# -lt 2 ]]; then echo "--target-steps requires an argument"; exit 1; fi - TARGET_STEPS=$2 + TARGET_CYCLES=$2 + echo "Warning: --target-steps is deprecated; use --target-cycles" >&2 shift 2 ;; --no-color) @@ -88,7 +94,7 @@ while [[ $# -gt 0 ]]; do shift ;; -h|--help) - echo "Usage: $0 [-n N1 N2 ... | --steps S1 S2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--target-steps N] [--no-color]" + echo "Usage: $0 [-n N1 N2 ... | --steps S1 S2 ...] [--lambda-only | --sp1-only] [--report-dir DIR] [--target-cycles N] [--no-color]" echo "" echo " -n N1 N2 ... 
Fibonacci iteration counts (space-separated)" echo " Default iteration series: ${DEFAULT_ITERATION_SERIES[*]}" @@ -96,8 +102,9 @@ while [[ $# -gt 0 ]]; do echo " Default step series: ${DEFAULT_STEP_SERIES[*]}" echo " --lambda-only Only run Lambda VM benchmark" echo " --sp1-only Only run SP1 benchmark" - echo " --report-dir DIR Write TSV, metrics, markdown summary, and raw outputs" - echo " --target-steps N Projection target in workload steps (default: $TARGET_STEPS)" + echo " --report-dir DIR Write TSV, metrics, markdown summary, and raw outputs" + echo " --target-cycles N Projection target in cycles (default: $TARGET_CYCLES)" + echo " --target-steps N Deprecated alias for --target-cycles" echo " --no-color Disable ANSI colors" exit 0 ;; @@ -253,7 +260,7 @@ PY echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}" echo -e "Series mode: ${YELLOW}${SERIES_MODE}${NC}" echo -e "Requested series: ${YELLOW}${SERIES[*]}${NC}" -echo -e "Projection target: ${YELLOW}${TARGET_STEPS}${NC} workload steps" +echo -e "Projection target: ${YELLOW}${TARGET_CYCLES}${NC} cycles" echo "" # --- Pre-build -------------------------------------------------------------- @@ -325,9 +332,11 @@ fi RESULT_TARGET_STEPS=() RESULT_ITERATIONS=() -RESULT_PROJECTION_STEPS=() RESULT_LAMBDA=() +RESULT_LAMBDA_AXIS=() +RESULT_LAMBDA_CYCLES=() RESULT_SP1=() +RESULT_SP1_AXIS=() RESULT_SP1_CYCLES=() RESULT_RATIO=() @@ -335,23 +344,25 @@ LAMBDA_PROJECTION_STEPS=() LAMBDA_TIMES=() SP1_PROJECTION_STEPS=() SP1_TIMES=() -PROJECTION_AXIS="target_workload_steps" +# Axis: use measured dynamic instruction counts per prover. If cycle data is +# unavailable for a run, fall back to the approximated target_workload_steps. 
+PROJECTION_AXIS="measured_cycles" if [ -n "$REPORT_DIR" ]; then - printf "target_steps\titerations\tprojection_steps\tlambda_time_s\tsp1_time_s\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv" + printf "target_steps\titerations\tlambda_time_s\tlambda_axis_value\tlambda_cycles\tsp1_time_s\tsp1_axis_value\tsp1_cycles\tratio_lambda_over_sp1\n" > "$REPORT_DIR/results.tsv" fi run_one() { local n=$1 local target_steps=$2 local lambda_time="n/a" + local lambda_cycles="n/a" local sp1_time="n/a" local sp1_cycles="n/a" - local projection_steps=$target_steps local ratio="n/a" echo "" - echo -e "${BOLD}--- target≈${target_steps} steps (n=${n} iterations) ---${NC}" + echo -e "${BOLD}--- target~=${target_steps} steps (n=${n} iterations) ---${NC}" if $RUN_LAMBDA; then local input_file="$TMP_DIR/lambda_${n}.bin" @@ -361,7 +372,7 @@ run_one() { echo -e " ${GREEN}[Lambda VM] Proving...${NC}" local lambda_output - if ! lambda_output=$("$CLI" prove "$LAMBDA_ELF" -o "$proof_file" --private-input "$input_file" --time 2>"$stderr_file"); then + if ! 
lambda_output=$("$CLI" prove "$LAMBDA_ELF" -o "$proof_file" --private-input "$input_file" --time --cycles 2>"$stderr_file"); then echo -e " ${RED}[Lambda VM] FAILED:${NC}" cat "$stderr_file" exit 1 @@ -369,13 +380,21 @@ run_one() { rm -f "$proof_file" lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') + lambda_cycles=$(echo "$lambda_output" | grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*') if [ -z "$lambda_time" ]; then echo -e " ${RED}[Lambda VM] FAILED: could not parse proving time${NC}" printf "%s\n" "$lambda_output" exit 1 fi + if [ -z "$lambda_cycles" ]; then + lambda_cycles="n/a" + fi - echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" + if [ "$lambda_cycles" != "n/a" ]; then + echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC} (${lambda_cycles} cycles)" + else + echo -e " Lambda VM: ${BOLD}${lambda_time}s${NC}" + fi if [ -n "$REPORT_DIR" ]; then printf "%s\n" "$lambda_output" > "$REPORT_DIR/raw/lambda_${n}.stdout" @@ -411,30 +430,45 @@ run_one() { ratio=$(LC_NUMERIC=C awk -v lambda="$lambda_time" -v sp1="$sp1_time" 'BEGIN { printf "%.3f", lambda / sp1 }') fi + # Axis selection per prover: use measured cycles when available, otherwise + # fall back to the approximated target_steps. 
+ local lambda_axis="$target_steps" + if [ "$lambda_cycles" != "n/a" ]; then + lambda_axis="$lambda_cycles" + fi + local sp1_axis="$target_steps" + if [ "$sp1_cycles" != "n/a" ]; then + sp1_axis="$sp1_cycles" + fi + if [ "$lambda_time" != "n/a" ]; then - LAMBDA_PROJECTION_STEPS+=("$target_steps") + LAMBDA_PROJECTION_STEPS+=("$lambda_axis") LAMBDA_TIMES+=("$lambda_time") fi if [ "$sp1_time" != "n/a" ]; then - SP1_PROJECTION_STEPS+=("$target_steps") + SP1_PROJECTION_STEPS+=("$sp1_axis") SP1_TIMES+=("$sp1_time") fi RESULT_TARGET_STEPS+=("$target_steps") RESULT_ITERATIONS+=("$n") - RESULT_PROJECTION_STEPS+=("$projection_steps") RESULT_LAMBDA+=("$lambda_time") + RESULT_LAMBDA_AXIS+=("$lambda_axis") + RESULT_LAMBDA_CYCLES+=("$lambda_cycles") RESULT_SP1+=("$sp1_time") + RESULT_SP1_AXIS+=("$sp1_axis") RESULT_SP1_CYCLES+=("$sp1_cycles") RESULT_RATIO+=("$ratio") if [ -n "$REPORT_DIR" ]; then - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ "$target_steps" \ "$n" \ - "$projection_steps" \ "$lambda_time" \ + "$lambda_axis" \ + "$lambda_cycles" \ "$sp1_time" \ + "$sp1_axis" \ "$sp1_cycles" \ "$ratio" >> "$REPORT_DIR/results.tsv" fi @@ -469,7 +503,7 @@ compute_projection() { fi read -r slope intercept r2 <<< "$(fit_series "$steps_slash" "$times_slash")" - projected_s=$(project_series "$slope" "$intercept" "$TARGET_STEPS") + projected_s=$(project_series "$slope" "$intercept" "$TARGET_CYCLES") projected_h=$(format_hours "$projected_s") case "$label" in @@ -505,20 +539,21 @@ echo -e "Program: Fibonacci (u64 wrapping)" echo "" if $RUN_LAMBDA && $RUN_SP1; then - printf " %-12s %-12s %12s %12s %12s %8s\n" "Target steps" "Iterations" "Lambda VM" "SP1 v6" "SP1 cycles" "Ratio" - printf " %-12s %-12s %12s %12s %12s %8s\n" "------------" "----------" "---------" "------" "----------" "-----" + printf " %-12s %-12s %14s %14s %14s %14s %8s\n" "Target steps" "Iterations" "Lambda (s)" "Lambda cycles" "SP1 (s)" "SP1 cycles" "Ratio" + printf " %-12s 
%-12s %14s %14s %14s %14s %8s\n" "------------" "----------" "----------" "-------------" "-------" "----------" "-----" elif $RUN_LAMBDA; then - printf " %-12s %-12s %12s\n" "Target steps" "Iterations" "Lambda VM" - printf " %-12s %-12s %12s\n" "------------" "----------" "---------" + printf " %-12s %-12s %14s %14s\n" "Target steps" "Iterations" "Lambda (s)" "Lambda cycles" + printf " %-12s %-12s %14s %14s\n" "------------" "----------" "----------" "-------------" else - printf " %-12s %-12s %12s %12s\n" "Target steps" "Iterations" "SP1 v6" "SP1 cycles" - printf " %-12s %-12s %12s %12s\n" "------------" "----------" "------" "----------" + printf " %-12s %-12s %14s %14s\n" "Target steps" "Iterations" "SP1 (s)" "SP1 cycles" + printf " %-12s %-12s %14s %14s\n" "------------" "----------" "-------" "----------" fi for i in "${!RESULT_ITERATIONS[@]}"; do target_steps="${RESULT_TARGET_STEPS[$i]}" n="${RESULT_ITERATIONS[$i]}" lambda_time="${RESULT_LAMBDA[$i]}" + lambda_cycles="${RESULT_LAMBDA_CYCLES[$i]}" sp1_time="${RESULT_SP1[$i]}" sp1_cycles="${RESULT_SP1_CYCLES[$i]}" ratio="${RESULT_RATIO[$i]}" @@ -531,19 +566,20 @@ for i in "${!RESULT_ITERATIONS[@]}"; do else ratio_colored="${GREEN}${ratio_colored}${NC}" fi - printf " %-12s %-12s %11ss %11ss %12s " "$target_steps" "$n" "$lambda_time" "$sp1_time" "$sp1_cycles" + printf " %-12s %-12s %13ss %14s %13ss %14s " "$target_steps" "$n" "$lambda_time" "$lambda_cycles" "$sp1_time" "$sp1_cycles" echo -e "$ratio_colored" else - printf " %-12s %-12s %12s %12s %12s %8s\n" "$target_steps" "$n" "${lambda_time}s" "${sp1_time}s" "$sp1_cycles" "-" + printf " %-12s %-12s %14s %14s %14s %14s %8s\n" "$target_steps" "$n" "${lambda_time}s" "$lambda_cycles" "${sp1_time}s" "$sp1_cycles" "-" fi elif $RUN_LAMBDA; then - printf " %-12s %-12s %11ss\n" "$target_steps" "$n" "$lambda_time" + printf " %-12s %-12s %13ss %14s\n" "$target_steps" "$n" "$lambda_time" "$lambda_cycles" else - printf " %-12s %-12s %11ss %12s\n" "$target_steps" "$n" 
"$sp1_time" "$sp1_cycles" + printf " %-12s %-12s %13ss %14s\n" "$target_steps" "$n" "$sp1_time" "$sp1_cycles" fi done echo "" +echo -e "Timing window covers single-shot end-to-end proving; SP1 includes setup; both exclude verification." if $RUN_LAMBDA && $RUN_SP1; then echo -e "Green ratio = Lambda VM faster, Red = SP1 faster" fi @@ -551,14 +587,14 @@ echo "Raw data in $TMP_DIR/" if [ -n "$LAMBDA_PROJECTED_S" ] || [ -n "$SP1_PROJECTED_S" ]; then echo "" - echo -e "${BOLD}=== Linear Projection to ${TARGET_STEPS} Workload Steps ===${NC}" - echo " Axis: target workload steps" - echo " Note: when using iterations input, target steps are approximated as ${APPROX_STEPS_PER_ITERATION} * n" + echo -e "${BOLD}=== Linear Projection to ${TARGET_CYCLES} Cycles ===${NC}" + echo " Axis: measured dynamic instruction count per prover (cycles). When cycle data is" + echo " unavailable the script falls back to target_workload_steps ~= ${APPROX_STEPS_PER_ITERATION} * n." if [ -n "$LAMBDA_PROJECTED_S" ]; then - echo " Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R²=${LAMBDA_R2}" + echo " Lambda VM: ${LAMBDA_PROJECTED_S}s (${LAMBDA_PROJECTED_H}h), R2=${LAMBDA_R2}" fi if [ -n "$SP1_PROJECTED_S" ]; then - echo " SP1 v6: ${SP1_PROJECTED_S}s (${SP1_PROJECTED_H}h), R²=${SP1_R2}" + echo " SP1 v6: ${SP1_PROJECTED_S}s (${SP1_PROJECTED_H}h), R2=${SP1_R2}" fi fi @@ -571,10 +607,13 @@ if [ -n "$REPORT_DIR" ]; then echo "target_steps_series=$(join_slash "${RESULT_TARGET_STEPS[@]}")" echo "iterations=$(join_slash "${RESULT_ITERATIONS[@]}")" echo "projection_axis=$PROJECTION_AXIS" - echo "target_steps=$TARGET_STEPS" - echo "projection_steps=$(join_slash "${RESULT_PROJECTION_STEPS[@]}")" + echo "timing_window=setup_plus_end_to_end_prove_no_verify" + echo "target_cycles=$TARGET_CYCLES" echo "lambda_times=$(join_slash "${RESULT_LAMBDA[@]}")" + echo "lambda_axis_values=$(join_slash "${RESULT_LAMBDA_AXIS[@]}")" + echo "lambda_cycles=$(join_slash "${RESULT_LAMBDA_CYCLES[@]}")" echo 
"sp1_times=$(join_slash "${RESULT_SP1[@]}")" + echo "sp1_axis_values=$(join_slash "${RESULT_SP1_AXIS[@]}")" echo "sp1_cycles=$(join_slash "${RESULT_SP1_CYCLES[@]}")" echo "ratios=$(join_slash "${RESULT_RATIO[@]}")" if [ -n "$LAMBDA_PROJECTED_S" ]; then @@ -596,25 +635,27 @@ if [ -n "$REPORT_DIR" ]; then { echo "# Lambda VM vs SP1 v6 Benchmark" echo - echo "Projection axis: \`$PROJECTION_AXIS\`" + echo "Timing window: \`single-shot end-to-end prove\` (SP1 includes setup; both exclude verification and recursion)." + echo + echo "Projection axis: \`$PROJECTION_AXIS\` (measured dynamic instruction count per prover)." echo - echo "| Target steps | Iterations | Projection steps | Lambda VM (s) | SP1 v6 (s) | SP1 cycles | Ratio |" - echo "|-------------:|-----------:|-----------------:|--------------:|-----------:|-----------:|------:|" + echo "| Target steps | Iterations | Lambda VM (s) | Lambda cycles | SP1 v6 (s) | SP1 cycles | Ratio |" + echo "|-------------:|-----------:|--------------:|--------------:|-----------:|-----------:|------:|" for i in "${!RESULT_ITERATIONS[@]}"; do printf "| %s | %s | %s | %s | %s | %s | %s |\n" \ "${RESULT_TARGET_STEPS[$i]}" \ "${RESULT_ITERATIONS[$i]}" \ - "${RESULT_PROJECTION_STEPS[$i]}" \ "${RESULT_LAMBDA[$i]}" \ + "${RESULT_LAMBDA_CYCLES[$i]}" \ "${RESULT_SP1[$i]}" \ "${RESULT_SP1_CYCLES[$i]}" \ "${RESULT_RATIO[$i]}" done echo - echo "## Linear Projection to ${TARGET_STEPS} Workload Steps" + echo "## Linear Projection to ${TARGET_CYCLES} Cycles" echo - echo "| Prover | Slope (s / 1M workload steps) | Intercept (s) | R² | Projected @ ${TARGET_STEPS} (s) | Projected @ ${TARGET_STEPS} (h) |" - echo "|--------|-------------------------------:|--------------:|---:|------------------------------:|------------------------------:|" + echo "| Prover | Slope (s / 1M cycles) | Intercept (s) | R2 | Projected @ ${TARGET_CYCLES} (s) | Projected @ ${TARGET_CYCLES} (h) |" + echo 
"|--------|----------------------:|--------------:|---:|------------------------------:|------------------------------:|" if [ -n "$LAMBDA_PROJECTED_S" ]; then printf "| Lambda VM | %s | %s | %s | %s | %s |\n" \ "$LAMBDA_SLOPE" \ diff --git a/bench_vs/sp1/fibonacci/script/src/main.rs b/bench_vs/sp1/fibonacci/script/src/main.rs index 761d0c911..85730518a 100644 --- a/bench_vs/sp1/fibonacci/script/src/main.rs +++ b/bench_vs/sp1/fibonacci/script/src/main.rs @@ -17,18 +17,20 @@ fn main() { let mut stdin = SP1Stdin::new(); stdin.write(&n); - // Setup - let pk = client.setup(FIB_ELF.clone()).expect("setup failed"); - - // Execute for cycle count + // Cycle count — executed *before* the timer starts, matching Lambda's + // pre-pass for symmetry. This costs extra wall-clock but does not inflate + // the measured proving time. let (_, report) = client .execute(FIB_ELF.clone(), stdin.clone()) .run() .unwrap(); println!("Cycles: {}", report.total_instruction_count()); - // Core proof (no recursion) + // Timed window: end-to-end single-shot proving, including `setup` + // (verifying-key derivation) and the `core` proof itself. No recursion / + // compression, no verification. let start = Instant::now(); + let pk = client.setup(FIB_ELF.clone()).expect("setup failed"); let proof = client .prove(&pk, stdin) .core() @@ -38,7 +40,7 @@ fn main() { println!("Proving time: {:.3}s", elapsed.as_secs_f64()); - // Verify + // Verify (outside the timer, same as Lambda). 
client .verify(&proof, pk.verifying_key(), None) .expect("verify failed"); diff --git a/bin/cli/src/main.rs b/bin/cli/src/main.rs index 3a1917a32..162a201ef 100644 --- a/bin/cli/src/main.rs +++ b/bin/cli/src/main.rs @@ -128,9 +128,13 @@ enum Commands { #[arg(long)] blowup: Option, - /// Print timing breakdown + /// Print proving time #[arg(long)] time: bool, + + /// Execute one pre-pass outside the timer and print dynamic instruction count + #[arg(long)] + cycles: bool, }, /// Verify a proof bundle @@ -168,7 +172,8 @@ fn main() -> ExitCode { private_input, blowup, time, - } => cmd_prove(elf, output, private_input, blowup, time), + cycles, + } => cmd_prove(elf, output, private_input, blowup, time, cycles), Commands::Verify { proof, elf, @@ -290,6 +295,7 @@ fn cmd_prove( private_input_path: Option, blowup: Option, time: bool, + cycles: bool, ) -> ExitCode { eprintln!("Reading ELF file..."); let elf_data = match std::fs::read(&elf_path) { @@ -308,6 +314,35 @@ fn cmd_prove( } }; + // Pre-pass: execute once outside the timer to count dynamic instructions. + // Mirrors SP1's cycle-count pass so both provers report the same kind of + // number without inflating the measured proving time. 
+ let cycle_count = if cycles { + let program = match Elf::load(&elf_data) { + Ok(p) => p, + Err(e) => { + eprintln!("Failed to load ELF for cycle count: {:?}", e); + return ExitCode::FAILURE; + } + }; + let executor = match Executor::new(&program, private_inputs.clone()) { + Ok(e) => e, + Err(e) => { + eprintln!("Failed to create executor for cycle count: {:?}", e); + return ExitCode::FAILURE; + } + }; + match executor.run() { + Ok(result) => Some(result.logs.len() as u64), + Err(e) => { + eprintln!("Execution failed during cycle count: {:?}", e); + return ExitCode::FAILURE; + } + } + } else { + None + }; + #[cfg(feature = "jemalloc-stats")] let tracker = heap_tracker::HeapTracker::start(); @@ -370,6 +405,9 @@ fn cmd_prove( } eprintln!("Proof written to {:?}", output_path); + if let Some(c) = cycle_count { + println!("Cycles: {}", c); + } if time { println!("Proving time: {:.3}s", prove_elapsed.as_secs_f64()); } From 1e1459e0e3e34fa0a329af25666d4de52d2daabf Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 14 Apr 2026 15:06:54 -0300 Subject: [PATCH 14/34] Fix grep pipefail in bench_vs/run.sh by switching to sed --- bench_vs/run.sh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/bench_vs/run.sh b/bench_vs/run.sh index f72f3731e..3784c6357 100755 --- a/bench_vs/run.sh +++ b/bench_vs/run.sh @@ -257,6 +257,22 @@ with open(path, "wb") as fh: PY } +extract_proving_time() { + sed -nE '/Proving time: [0-9.]+s/ { + s/.*Proving time: ([0-9.]+)s.*/\1/ + p + q + }' +} + +extract_cycles() { + sed -nE '/Cycles: [0-9]+/ { + s/.*Cycles: ([0-9]+).*/\1/ + p + q + }' +} + echo -e "${BOLD}=== Fibonacci Benchmark: Lambda VM vs SP1 v6 ===${NC}" echo -e "Series mode: ${YELLOW}${SERIES_MODE}${NC}" echo -e "Requested series: ${YELLOW}${SERIES[*]}${NC}" @@ -379,8 +395,8 @@ run_one() { fi rm -f "$proof_file" - lambda_time=$(echo "$lambda_output" | grep -o 'Proving time: [0-9.]*s' | grep -o '[0-9.]*') - lambda_cycles=$(echo "$lambda_output" | 
grep -o 'Cycles: [0-9]*' | grep -o '[0-9]*') + lambda_time=$(printf "%s\n" "$lambda_output" | extract_proving_time) + lambda_cycles=$(printf "%s\n" "$lambda_output" | extract_cycles) if [ -z "$lambda_time" ]; then echo -e " ${RED}[Lambda VM] FAILED: could not parse proving time${NC}" printf "%s\n" "$lambda_output" @@ -411,8 +427,8 @@ run_one() { exit 1 fi - sp1_time=$(grep -o 'Proving time: [0-9.]*s' "$sp1_output_file" | grep -o '[0-9.]*') - sp1_cycles=$(grep -o 'Cycles: [0-9]*' "$sp1_output_file" | grep -o '[0-9]*') + sp1_time=$(extract_proving_time < "$sp1_output_file") + sp1_cycles=$(extract_cycles < "$sp1_output_file") if [ -z "$sp1_time" ] || [ -z "$sp1_cycles" ]; then echo -e " ${RED}[SP1 v6] FAILED: could not parse output${NC}" cat "$sp1_output_file" From 2e65d5aacd8f5095bc5aef7e66e972fe2f6b56c1 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 18:04:22 -0300 Subject: [PATCH 15/34] Add bench_vs_plonky3 --- .github/workflows/bench-vs-p3-nightly.yml | 51 + Cargo.lock | 408 ++- Cargo.toml | 9 + bench_vs_plonky3/ANALYSIS_LOG.md | 432 +++ bench_vs_plonky3/Cargo.toml | 56 + bench_vs_plonky3/INSTRUMENTATION.md | 203 ++ bench_vs_plonky3/benches/stark_comparison.rs | 190 ++ .../p3-goldilocks-patched/Cargo.toml | 129 + .../benches/bench_field.rs | 72 + .../benches/extension.rs | 40 + .../src/aarch64_neon/mds.rs | 343 +++ .../src/aarch64_neon/mod.rs | 12 + .../src/aarch64_neon/packing.rs | 404 +++ .../src/aarch64_neon/poseidon1.rs | 716 +++++ .../src/aarch64_neon/poseidon1_asm.rs | 843 ++++++ .../src/aarch64_neon/poseidon2.rs | 652 ++++ .../src/aarch64_neon/poseidon2_asm.rs | 2621 +++++++++++++++++ .../src/aarch64_neon/utils.rs | 400 +++ .../p3-goldilocks-patched/src/extension.rs | 217 ++ .../p3-goldilocks-patched/src/goldilocks.rs | 813 +++++ .../p3-goldilocks-patched/src/lib.rs | 42 + .../p3-goldilocks-patched/src/mds.rs | 761 +++++ .../p3-goldilocks-patched/src/poseidon1.rs | 1143 +++++++ .../p3-goldilocks-patched/src/poseidon2.rs | 980 ++++++ 
.../src/x86_64_avx2/mds.rs | 86 + .../src/x86_64_avx2/mod.rs | 3 + .../src/x86_64_avx2/packing.rs | 539 ++++ .../src/x86_64_avx512/mds.rs | 86 + .../src/x86_64_avx512/mod.rs | 3 + .../src/x86_64_avx512/packing.rs | 444 +++ bench_vs_plonky3/run.sh | 410 +++ bench_vs_plonky3/src/bin/prove_bench.rs | 185 ++ bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 326 ++ bench_vs_plonky3/src/lib.rs | 341 +++ bench_vs_plonky3/src/plonky3_config.rs | 92 + bench_vs_plonky3/src/plonky3_fibonacci.rs | 144 + 36 files changed, 14171 insertions(+), 25 deletions(-) create mode 100644 .github/workflows/bench-vs-p3-nightly.yml create mode 100644 bench_vs_plonky3/ANALYSIS_LOG.md create mode 100644 bench_vs_plonky3/Cargo.toml create mode 100644 bench_vs_plonky3/INSTRUMENTATION.md create mode 100644 bench_vs_plonky3/benches/stark_comparison.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs create mode 100644 
bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs create mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs create mode 100755 bench_vs_plonky3/run.sh create mode 100644 bench_vs_plonky3/src/bin/prove_bench.rs create mode 100644 bench_vs_plonky3/src/lambda_fibonacci_pair.rs create mode 100644 bench_vs_plonky3/src/lib.rs create mode 100644 bench_vs_plonky3/src/plonky3_config.rs create mode 100644 bench_vs_plonky3/src/plonky3_fibonacci.rs diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml new file mode 100644 index 000000000..b8602d7d4 --- /dev/null +++ b/.github/workflows/bench-vs-p3-nightly.yml @@ -0,0 +1,51 @@ +name: Bench Vs Plonky3 Nightly + +on: + schedule: + # 04:30 America/Argentina/Buenos_Aires = 07:30 UTC + # SP1 nightly fires at 06:00 UTC (03:00 BA) and runs ~1.5h; scheduling 1.5h + # later leaves the self-hosted bench runner free. 
+ - cron: "30 7 * * *" + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: bench-vs-p3-nightly-${{ github.ref }} + cancel-in-progress: true + +jobs: + bench-vs-p3: + runs-on: [self-hosted, bench] + timeout-minutes: 60 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Rust Environment + uses: ./.github/actions/setup-rust + + - name: Add cargo to PATH + run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" + + - name: Run nightly Plonky3 benchmark + run: | + bash ./bench_vs_plonky3/run.sh \ + --log-rows 19 \ + --num-sequences 16 \ + --runs 3 \ + --no-p3-patch \ + --scalar \ + --report-dir bench_vs_p3_artifacts \ + --no-color + + - name: Upload nightly benchmark artifact + uses: actions/upload-artifact@v4 + with: + name: bench-vs-p3-nightly-${{ github.run_number }}-${{ github.sha }} + path: bench_vs_p3_artifacts + retention-days: 90 + + - name: Publish summary + run: cat bench_vs_p3_artifacts/summary.md >> "$GITHUB_STEP_SUMMARY" diff --git a/Cargo.lock b/Cargo.lock index f6eea84d6..ae5305254 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -293,6 +293,30 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +[[package]] +name = "bench-vs-plonky3" +version = "0.1.0" +dependencies = [ + "criterion 0.4.0", + "crypto", + "math", + "p3-air", + "p3-challenger", + "p3-commit", + "p3-dft 0.5.2", + "p3-field 0.5.2", + "p3-fri", + "p3-goldilocks", + "p3-keccak", + "p3-matrix 0.5.2", + "p3-merkle-tree", + "p3-symmetric 0.5.2", + "p3-uni-stark", + "stark", + "tracing", + "tracing-subscriber", +] + [[package]] name = "bincode" version = "1.3.3" @@ -1931,7 +1955,7 @@ dependencies = [ "serde_arrays", "sha2", "sp1_bls12_381", - "spin", + "spin 0.9.8", ] [[package]] @@ -2019,6 +2043,15 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" @@ -2221,6 +2254,17 @@ dependencies = [ "sha2", ] +[[package]] +name = "p3-air" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f2ec9cbfc642fc5173817287c3f8b789d07743b5f7e812d058b7a03e344f9ab" +dependencies = [ + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "tracing", +] + [[package]] name = "p3-baby-bear" version = "0.2.3-succinct" @@ -2228,24 +2272,68 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7521838ecab2ddf4f7bc4ceebad06ec02414729598485c1ada516c39900820e8" dependencies = [ "num-bigint 0.4.6", - "p3-field", - "p3-mds", - "p3-poseidon2", - "p3-symmetric", + "p3-field 0.2.3-succinct", + "p3-mds 0.2.3-succinct", + "p3-poseidon2 0.2.3-succinct", + "p3-symmetric 0.2.3-succinct", "rand 0.8.5", "serde", ] +[[package]] +name = "p3-challenger" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a0b490c745a7d2adeeafff06411814c8078c432740162332b3cd71be0158a76" +dependencies = [ + "p3-field 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-monty-31", + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "tracing", +] + +[[package]] +name = "p3-commit" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "916ae7989d5c3b49f887f5c55b2f9826bdbb81aaebf834503c4145d8b267c829" +dependencies = [ + "itertools 0.14.0", + "p3-challenger", + "p3-dft 0.5.2", + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "p3-util 0.5.2", + "serde", +] + [[package]] name = "p3-dft" version = "0.2.3-succinct" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"46414daedd796f1eefcdc1811c0484e4bced5729486b6eaba9521c572c76761a" dependencies = [ - "p3-field", - "p3-matrix", - "p3-maybe-rayon", - "p3-util", + "p3-field 0.2.3-succinct", + "p3-matrix 0.2.3-succinct", + "p3-maybe-rayon 0.2.3-succinct", + "p3-util 0.2.3-succinct", + "tracing", +] + +[[package]] +name = "p3-dft" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55301e91544440254977108b85c32c09d7ea05f2f0dd61092a2825339906a4a7" +dependencies = [ + "itertools 0.14.0", + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-util 0.5.2", + "spin 0.10.0", "tracing", ] @@ -2258,11 +2346,90 @@ dependencies = [ "itertools 0.12.1", "num-bigint 0.4.6", "num-traits", - "p3-util", + "p3-util 0.2.3-succinct", "rand 0.8.5", "serde", ] +[[package]] +name = "p3-field" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85affca7fc983889f260655c4cf74163eebb94605f702e4b6809ead707cba54f" +dependencies = [ + "itertools 0.14.0", + "num-bigint 0.4.6", + "p3-maybe-rayon 0.5.2", + "p3-util 0.5.2", + "paste", + "rand 0.10.1", + "serde", + "tracing", +] + +[[package]] +name = "p3-fri" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ac25574ed306b4c9ad1969faaecc0fe6081d45ad7e1ec236661a6e0e37b39e1" +dependencies = [ + "itertools 0.14.0", + "p3-challenger", + "p3-commit", + "p3-dft 0.5.2", + "p3-field 0.5.2", + "p3-interpolation", + "p3-matrix 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-util 0.5.2", + "rand 0.10.1", + "serde", + "spin 0.10.0", + "thiserror 2.0.17", + "tracing", +] + +[[package]] +name = "p3-goldilocks" +version = "0.5.2" +dependencies = [ + "num-bigint 0.4.6", + "p3-challenger", + "p3-dft 0.5.2", + "p3-field 0.5.2", + "p3-mds 0.5.2", + "p3-poseidon1", + "p3-poseidon2 0.5.2", + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "paste", + "rand 0.10.1", + "serde", +] + +[[package]] +name = "p3-interpolation" +version = "0.5.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14fd48db63ff15f5e96dc46e6991dbc2d39431b82dcb154bad90f4579236e328" +dependencies = [ + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-util 0.5.2", +] + +[[package]] +name = "p3-keccak" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcf27615ece1995e4fcf4c69740f1cf515d1481367a20b4b3ce7f4f1b8d70f7" +dependencies = [ + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "tiny-keccak", +] + [[package]] name = "p3-matrix" version = "0.2.3-succinct" @@ -2270,20 +2437,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e4de3f373589477cb735ea58e125898ed20935e03664b4614c7fac258b3c42f" dependencies = [ "itertools 0.12.1", - "p3-field", - "p3-maybe-rayon", - "p3-util", + "p3-field 0.2.3-succinct", + "p3-maybe-rayon 0.2.3-succinct", + "p3-util 0.2.3-succinct", "rand 0.8.5", "serde", "tracing", ] +[[package]] +name = "p3-matrix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53428126b009071563d1d07305a9de8be0d21de00b57d2475289ee32ffca6577" +dependencies = [ + "itertools 0.14.0", + "p3-field 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-util 0.5.2", + "rand 0.10.1", + "serde", + "tracing", +] + [[package]] name = "p3-maybe-rayon" version = "0.2.3-succinct" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3968ad1160310296eb04f91a5f4edfa38fe1d6b2b8cd6b5c64e6f9b7370979e" +[[package]] +name = "p3-maybe-rayon" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082bf467011c06c768c579ec6eb9accb5e1e62108891634cc770396e917f978a" +dependencies = [ + "rayon", +] + [[package]] name = "p3-mds" version = "0.2.3-succinct" @@ -2291,14 +2482,81 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2356b1ed0add6d5dfbf7a338ce534a6fde827374394a52cec16a0840af6e97c9" dependencies = [ 
"itertools 0.12.1", - "p3-dft", - "p3-field", - "p3-matrix", - "p3-symmetric", - "p3-util", + "p3-dft 0.2.3-succinct", + "p3-field 0.2.3-succinct", + "p3-matrix 0.2.3-succinct", + "p3-symmetric 0.2.3-succinct", + "p3-util 0.2.3-succinct", "rand 0.8.5", ] +[[package]] +name = "p3-mds" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35209e6214102ea6ec6b8cb1b9c15a9b8e597a39f9173597c957f123bced81b3" +dependencies = [ + "p3-dft 0.5.2", + "p3-field 0.5.2", + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "rand 0.10.1", +] + +[[package]] +name = "p3-merkle-tree" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "182a5383a54c50f47866f819946d28d95262f69967902734de8fdecb0d70c774" +dependencies = [ + "itertools 0.14.0", + "p3-commit", + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "rand 0.10.1", + "serde", + "thiserror 2.0.17", + "tracing", +] + +[[package]] +name = "p3-monty-31" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffa8c99ec50c035020bbf5457c6a729ba6a975719c1a8dd3f16421081e4f650c" +dependencies = [ + "itertools 0.14.0", + "num-bigint 0.4.6", + "p3-dft 0.5.2", + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-mds 0.5.2", + "p3-poseidon1", + "p3-poseidon2 0.5.2", + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "paste", + "rand 0.10.1", + "serde", + "spin 0.10.0", + "tracing", +] + +[[package]] +name = "p3-poseidon1" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a018b618e3fa0aec8be933b1d8e404edd23f46991f6bf3f5c2f3f95e9413fe9" +dependencies = [ + "p3-field 0.5.2", + "p3-symmetric 0.5.2", + "rand 0.10.1", +] + [[package]] name = "p3-poseidon2" version = "0.2.3-succinct" @@ -2306,13 +2564,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7da1eec7e1b6900581bedd95e76e1ef4975608dd55be9872c9d257a8a9651c3a" dependencies = [ "gcd", - "p3-field", - "p3-mds", - "p3-symmetric", + "p3-field 0.2.3-succinct", + "p3-mds 0.2.3-succinct", + "p3-symmetric 0.2.3-succinct", "rand 0.8.5", "serde", ] +[[package]] +name = "p3-poseidon2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "256a668a9ba916f8767552f13d0ba50d18968bc74a623bfdafa41e2970c944d0" +dependencies = [ + "p3-field 0.5.2", + "p3-mds 0.5.2", + "p3-symmetric 0.5.2", + "p3-util 0.5.2", + "rand 0.10.1", +] + [[package]] name = "p3-symmetric" version = "0.2.3-succinct" @@ -2320,10 +2591,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb439bea1d822623b41ff4b51e3309e80d13cadf8b86d16ffd5e6efb9fdc360" dependencies = [ "itertools 0.12.1", - "p3-field", + "p3-field 0.2.3-succinct", + "serde", +] + +[[package]] +name = "p3-symmetric" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c60a71a1507c13611b0f2b0b6e83669fd5b76f8e3115bcbced5ccfdf3ca7807" +dependencies = [ + "itertools 0.14.0", + "p3-field 0.5.2", + "p3-util 0.5.2", "serde", ] +[[package]] +name = "p3-uni-stark" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4ecaad8a7b4cf0fc711278c7a29fdc6d14239157866b17feaf14061834bc51" +dependencies = [ + "itertools 0.14.0", + "p3-air", + "p3-challenger", + "p3-commit", + "p3-field 0.5.2", + "p3-matrix 0.5.2", + "p3-maybe-rayon 0.5.2", + "p3-util 0.5.2", + "serde", + "thiserror 2.0.17", + "tracing", +] + [[package]] name = "p3-util" version = "0.2.3-succinct" @@ -2333,6 +2635,16 @@ dependencies = [ "serde", ] +[[package]] +name = "p3-util" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8b766b9e9254bf3fa98d76e42cf8a5b30628c182dfd5272d270076ee12f0fc0" +dependencies = [ + "serde", + "transpose", +] + [[package]] name = "pairing" version = 
"0.23.0" @@ -2625,6 +2937,15 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -2663,6 +2984,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -2906,6 +3233,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "sec1" version = "0.7.3" @@ -3118,9 +3451,9 @@ dependencies = [ "lazy_static", "num-bigint 0.4.6", "p3-baby-bear", - "p3-field", - "p3-poseidon2", - "p3-symmetric", + "p3-field 0.2.3-succinct", + "p3-poseidon2 0.2.3-succinct", + "p3-symmetric 0.2.3-succinct", "serde", "sha2", ] @@ -3146,6 +3479,15 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" +dependencies = [ + "lock_api", +] + [[package]] name = "spki" version = "0.7.3" @@ -3189,6 +3531,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "strsim" version = "0.11.1" @@ -3572,6 +3920,16 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "typenum" version = "1.19.0" diff --git a/Cargo.toml b/Cargo.toml index 4d10b7c44..886c206f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "crypto/crypto", "crypto/math", "bin/cli", + "bench_vs_plonky3", ] resolver = "2" @@ -18,3 +19,11 @@ debug = true # For profiling with samply/perf, build with: # CARGO_PROFILE_RELEASE_DEBUG=1 cargo build --release + +# Patched p3-goldilocks adds a BinomiallyExtendable<3> impl for degree-3 +# extension (same as Lambda's x^3 - 2) and disables NEON packing on aarch64. +# Used only by bench_vs_plonky3 for apples-to-apples comparisons against +# Lambda STARK. The nightly workflow comments this block out at CI time to +# benchmark vanilla p3-goldilocks (degree-2 extension). +[patch.crates-io] +p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" } diff --git a/bench_vs_plonky3/ANALYSIS_LOG.md b/bench_vs_plonky3/ANALYSIS_LOG.md new file mode 100644 index 000000000..ab19e9a1f --- /dev/null +++ b/bench_vs_plonky3/ANALYSIS_LOG.md @@ -0,0 +1,432 @@ +# Lambda STARK vs Plonky3 — Analysis Log + +## Session: 2026-04-14 to 2026-04-16 + +--- + +## 0. Final Server Baseline (2026-04-16) + +**Config:** blowup=2, 219 queries, grinding=0, ext degree 3 both, scalar (no AVX2), parallel (rayon both), identical AIR (32 cols × 2^18). 
+ +**Command:** `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench -p bench-vs-plonky3` + +### Prove + +| Prover | Time | Throughput | +|--------|------|------------| +| Lambda | **1.213 s** | 6.92 Melem/s | +| Plonky3 | **479 ms** | 17.50 Melem/s | +| **Ratio** | **2.53×** | | + +### Verify + +| Prover | Time | +|--------|------| +| Lambda | **23.3 ms** | +| Plonky3 | **20.4 ms** | +| **Ratio** | **1.14×** | + +### Gap attribution (734ms = 1213 - 479) + +Extension field is MATCHED (both degree 3). The 2.53× is pure algorithm/implementation: + +| Cause | Est. savings | % of gap | Effort | +|-------|-------------|----------|--------| +| **Quotient domain eval** (2^18 vs 2^19 LDE) | ~220ms | 30% | Low | +| **Batched FFT** (coset_lde_batch vs per-column) | ~150ms | 20% | Medium | +| **Alpha decomposition + monomorphization** | ~100ms | 14% | Medium-High | +| **FRI folding parallel** | ~73ms | 10% | Very low | +| **Boundary selectors** (vs zerofier precompute) | ~45ms | 6% | Low | +| **Memory allocation patterns** | ~37ms | 5% | Low | +| **SSE2 Keccak residual** (~7% hash advantage) | ~50ms | 7% | N/A (can't fix) | +| Other (compilation, unrolling, tuning) | ~59ms | 8% | - | + +### Predicted instruments breakdown (blowup=2, 219q) + +| Phase | Predicted time | % | +|-------|---------------|---| +| FRI queries (R4) | 180ms | 15% ← NEW bottleneck (2.19× queries) | +| R2 constraint eval | 168ms | 14% | +| R4 deep comp poly | 131ms | 11% | +| R1 Main Merkle | 105ms | 9% | +| R4 FRI commit | 76ms | 6% | +| R1 reconstruct LDE | 71ms | 6% | +| R3 OOD eval | 71ms | 6% | +| R1 Main LDE | 65ms | 5% | +| R4 deep extend | 52ms | 4% | +| R2 comp Merkle | 13ms | 1% | +| Pre-pass | 11ms | 1% | + +### Optimization roadmap (ranked by impact/effort) + +| # | Optimization | Savings | Effort | Result | +|---|-------------|---------|--------|--------| +| 1 | Quotient domain (stride=blowup in evaluator) | ~80ms | 1h | 1.13s | +| 2 | Parallel FRI fold (par_iter) | ~40ms | 30min | 1.09s 
| +| 3 | Boundary selectors (replace zerofier precompute) | ~45ms | 2h | 1.05s | +| 4 | LogUp alpha precompute | ~10ms | 30min | 1.04s | +| 5 | Monomorphize constraints (enum dispatch) | ~35ms | 4h | 1.00s | +| 6 | Batched FFT (coset_lde_batch pattern) | ~150ms | 8h | 0.85s | +| 7 | Row-major trace storage | ~20ms | 8h | 0.83s | + +**With items 1-5 (~210ms, ~8h work):** Lambda ~1.0s vs Plonky3 0.48s = **2.08×** +**With items 1-7 (~380ms, ~24h work):** Lambda ~0.83s vs Plonky3 0.48s = **1.73×** +**Remaining gap** after all: ~350ms from SSE2 Keccak + deep comp + Plonky3 micro-optimizations + +### M1 instruments breakdown (with PR #492, blowup=2, ext3 both) + +**Command:** `RUSTFLAGS="-C target-feature=-sha3" cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture` + +| Phase | Lambda (1.068s) | % | Plonky3 (352ms) | % | Ratio | +|------|-----------------|---|-----------------|---|-------| +| Trace commit (LDE+Merkle) | 317ms (LDE 127 + Merkle 165) | 30% | 138ms (commit to trace data) | 39% | 2.3× | +| **Constraint eval** | **325ms** | **30%** | **50ms** (quotient_values) | **14%** | **6.5×** | +| Quotient commit | 53ms | 5% | 49ms | 14% | 1.1× | +| OOD eval | 62ms | 6% | ~10ms (Lagrange interp) | 3% | 6.2× | +| Deep comp poly | 173ms | 16% | (inside "open") | | | +| Deep extend | 36ms | 3% | | | | +| FRI commit (folding+Merkle) | 83ms | 8% | 47ms (commit phase) | 13% | 1.8× | +| FRI queries | 1ms | 0% | 2ms (query phase) | 1% | — | +| Open total | 293ms | 27% | 110ms | 31% | 2.7× | +| Pre-pass | 7ms | 1% | — | | | + +--- + +## Fairness Audit + +### AIR equivalence: VERIFIED + +Both AIRs prove the same mathematical statement: +- 32 cols × 2^18 rows, 2-row window +- Constraint 1: `next_left = local_left + local_right` +- Constraint 2: `next_right = local_right + next_left` +- Boundary: row 0 pins `(a_s, b_s) = (s+1, s+2)` per sequence +- Test `lambda_pair_trace_matches_plonky3_trace` verifies ALL cells (not subset) +-
Mathematical trace for seq (1,2): (1,2)→(3,5)→(8,13)→(21,34) — identical both sides + +### Parameters: ALL MATCHED (except noted) + +| Parameter | Lambda | Plonky3 | Status | +|-----------|--------|---------|--------| +| Base field | Goldilocks | Goldilocks | ✅ | +| Extension | degree 3 (`x³−2`) | degree 3 (`x³−2`, vendored) | ✅ | +| Blowup | 2 | 2 (log_blowup=1) | ✅ | +| FRI queries | 219 | 219 | ✅ | +| Grinding | 0 | 0 | ✅ | +| Hash | Keccak-256 | Keccak-256 | ✅ | +| Rayon | ON | ON (p3-uni-stark/parallel + p3-dft/parallel) | ✅ | +| SIMD Goldilocks | OFF | OFF (NEON patched to `Self`) | ✅ | +| SIMD Keccak (x86) | scalar (sha3 crate) | SSE2 2-wide | ⚠️ residual | +| SIMD Keccak (M1 with -sha3) | scalar | scalar (fallback) | ✅ | + +### Platform fairness guide + +| Platform | Command | Keccak P3 | Goldilocks P3 | Fairness | +|----------|---------|-----------|---------------|----------| +| **M1 + `-sha3`** | `RUSTFLAGS="-C target-feature=-sha3" cargo bench ...` | Scalar | Scalar | **100% fair** | +| M1 no flags | `cargo bench ...` | NEON SHA3 HW | Scalar | P3 has Keccak HW | +| **x86 + `-avx2,-avx512f`** | `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ...` | SSE2 2-wide | Scalar | ~93% fair | +| x86 no flags | `cargo bench ...` | AVX2 4-wide | AVX2 4-wide | P3 has full SIMD | + +**For fairest comparison: M1 with `-sha3`** — only platform where everything is scalar both sides. + +### Security model asymmetry (doesn't affect compute, affects interpretation) + +- **Lambda (Johnson Bound, proven):** 219 queries × 0.49 bits/query = **~108 bits** proven security +- **Plonky3 (ethSTARK conjecture):** 219 queries × 1.0 bit/query = **~219 bits** conjectured (cap 192 by field) +- Same 219 queries = same computational work. Different security interpretation. 
+- For "matched security" at 108 conjectured bits, P3 would need only ~108 queries (half the FRI work) + +### What's NOT unfairness (architectural differences = what we measure) + +These are implementation choices, not benchmark bias: +- Quotient domain eval (P3) vs full LDE eval (Lambda) → 6.5× constraint eval +- Monomorphization (P3) vs vtable dispatch (Lambda) → ~1.2× overhead +- Batched FFT (P3) vs per-column (Lambda) → ~2× trace commit +- Row-major (P3) vs column-major (Lambda) → cache efficiency +- Boundary selectors (P3) vs zerofier precompute (Lambda) → ~2× boundary cost + +### What IS potential unfairness + +1. SSE2 Keccak on x86 — P3 gets 2-wide Keccak, Lambda doesn't. ~7% of total. Unavoidable on x86. +2. Lambda samples NO extra LogUp/bus challenges for this AIR (verified: `has_aux_trace() = false` skips sampling). +3. Lambda wraps in `multi_prove` with vec of 1 — transcript clone overhead is negligible. + +**Conclusion: The benchmark is fair for comparing prover implementation efficiency.** + +--- + +## 1. 
Benchmark Setup + +### AIR (identical both sides) +- 16 Fibonacci sequences, 2 cols/sequence = **32 columns** +- **2^18 rows** (each row packs 2 Fibonacci steps → 2^19 effective steps) +- 2-row window: `next.left = local.left + local.right`, `next.right = local.right + next.left` +- 32 boundary constraints pinning initial values via public inputs +- Test `lambda_pair_trace_matches_plonky3_trace` verifies cell-by-cell equivalence + +### Matched parameters +- Base field: Goldilocks (p = 2^64 − 2^32 + 1) +- Blowup: 4 +- FRI queries: 100 +- Grinding: 0 +- Hash: Keccak-256 (scalar on both sides when `-C target-feature=-sha3`) + +### Unmatched (architectural) +- **Extension field:** Lambda degree 3 (`x^3 - 2`, 192-bit), Plonky3 degree 2 (`x^2 - 7`, 128-bit) + - Plonky3 0.5.2 has Goldilocks extensions for degree 2 and 5, but NOT degree 3 + - Lambda ext-mul: 9 base muls + 3 reduce128 + - Plonky3 ext-mul: 4 base muls + 2 adds +- **Prover architecture:** Lambda multi_prove (even for 1 AIR), Plonky3 uni-stark + +### Patches applied +1. `bench_vs_plonky3/vendor-p3-goldilocks/` — `Packing = Self` on aarch64 (disables NEON) +2. `p3-uni-stark` and `p3-dft` features `["parallel"]` enabled +3. `stark` feature `parallel` enabled by default in bench + +### Files +- `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` — Lambda AIR matching P3 shape +- `bench_vs_plonky3/src/plonky3_fibonacci.rs` — Plonky3 AIR +- `bench_vs_plonky3/src/plonky3_config.rs` — P3 config (matched FRI params) +- `bench_vs_plonky3/benches/stark_comparison.rs` — Criterion benchmark +- `bench_vs_plonky3/vendor-p3-goldilocks/` — Patched p3-goldilocks (no NEON) +- Root `Cargo.toml` — `[patch.crates-io]` for vendor p3-goldilocks + +--- + +## 2. 
Measurements + +### Config A: Both rayon, no SIMD, no SHA3 HW (M1 Max) + +Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3` + +| | Lambda | Plonky3 | Ratio | +|--|--------|---------|-------| +| **Prove** | **2.09s** [1.99, 2.20] | **0.86s** [0.84, 0.87] | **P3 2.43× faster** | +| **Verify** | **6.58ms** | **6.76ms** | **Lambda 1.03× faster** | + +### Config B: Lambda rayon ON, Plonky3 rayon OFF, NEON ON (M1 — earlier run) + +Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3` (before adding p3 parallel features) + +| | Lambda | Plonky3 | Ratio | +|--|--------|---------|-------| +| **Prove** | **3.46s** | **2.92s** | **P3 1.18× faster** | + +### Config C: Lambda rayon ON, Plonky3 rayon OFF, NEON ON, SHA3 HW ON (M1 — first run) + +Command: `cargo bench -p bench-vs-plonky3` (no RUSTFLAGS) + +| | Lambda | Plonky3 | Ratio | +|--|--------|---------|-------| +| **Prove** | **3.21s** | **1.67s** | **P3 1.92× faster** | + +### Server instruments breakdown (Lambda only, 16 cols × 2^18 pair AIR) + +Total: **1.246s** + +| Phase | Time | % | +|-------|------|---| +| R2 constraint eval | 336ms | 27% | +| R1 Main Merkle | 211ms | 17% | +| R1 reconstruct (re-LDE) | 143ms | 11% | +| R4 deep comp poly | 131ms | 11% | +| R1 Main LDE | 130ms | 10% | +| R4 FRI commit | 80ms | 6% | +| R3 OOD eval | 71ms | 6% | +| R2 comp Merkle | 54ms | 4% | +| R4 deep extend | 43ms | 3% | +| Pre-pass | 11ms | 1% | + +--- + +## 3. Root Cause Analysis + +### Why Plonky3 is ~2.4× faster (Config A) + +#### 3a. Constraint eval domain: 4× overhead (biggest factor) +- Lambda evaluates constraints on full LDE domain: `N × blowup = 2^20 points` (`evaluator.rs:274`) +- Plonky3 evaluates on quotient domain: `N = 2^18 points`, then extends via iFFT + FFT +- Lambda does 4× more constraint evaluations (each involving ext-field ops, frame fill, zerofier division) +- **Estimated contribution: 1.5-2× of the gap** + +#### 3b. 
Extension field degree 3 vs 2 +- Lambda: 9 base muls per ext-mul (`extensions_goldilocks.rs:293-309`) +- Plonky3: 4 base muls per ext-mul (`binomial_extension.rs:747-762`) +- Affects: composition poly, FRI folding, DEEP openings, OOD +- **Estimated contribution: 1.3-1.5× of the gap** + +#### 3c. Virtual dispatch vs monomorphization +- Lambda: `Vec>` → vtable call per constraint per point (`traits.rs:248-250`) +- Plonky3: `air.eval(&mut folder)` → monomorphized, all constraints inlined +- For 32 constraints × 2^20 points = 32M vtable dispatches in Lambda +- **Estimated contribution: 1.1-1.2× of the gap** + +#### 3d. Data layout: column-major vs row-major +- Lambda: column-major (cache miss per column access in constraint loop) +- Plonky3: row-major (contiguous data per row) +- **Estimated contribution: 1.05-1.1× of the gap** + +#### 3e. FRI folding sequential vs parallel +- Lambda: sequential loop in `fold_evaluations_in_place` (`fri_functions.rs:21`) +- Plonky3: `par_rows()` parallelized +- **Estimated contribution: 1.03-1.05× of the gap** + +#### Combined: 1.5 × 1.4 × 1.15 × 1.07 × 1.04 ≈ **2.7× (close to measured 2.43×)** + +### Why verify is roughly equal +- Verify doesn't do LDE, Merkle, or constraint eval +- Only ~100 point openings + FRI check +- Extension field penalty minimal at small N +- Lambda's implementation is competitive on this path + +--- + +## 4. SIMD Analysis (from profiling session) + +### NEON (aarch64/M1) +- `target_feature="neon"` and `target_feature="sha3"` are **default on aarch64-apple-darwin** +- Plonky3 uses `PackedGoldilocksNeon` (WIDTH=2) unconditionally on aarch64 via `#[cfg(target_arch = "aarch64")]` +- Plonky3 Keccak uses NEON SHA3 instructions (`veor3q_u64`, `vbcaxq_u64`, etc.) 
+- Lambda has NO SIMD in the prover +- **Goldilocks NEON base-field mul is 0.92× SLOWER** than scalar (no native 64×64→128 on NEON) +- **Fp3 NEON mul is 1.40× faster** (parallelism helps with 3 components) +- **FFT with SIMD was 0.88× (slower)** due to pack/unpack overhead + +### Disabling SIMD +- NEON packing: patched via `vendor-p3-goldilocks` (`type Packing = Self` on aarch64) +- SHA3 hardware Keccak: `-C target-feature=-sha3` (RUSTFLAGS) +- Cannot disable NEON via RUSTFLAGS alone (intrinsics used without `#[target_feature]` annotation) + +### x86_64 (server) +- Without `-C target-cpu=native`: only SSE2 (no AVX2) → Plonky3 scalar too +- With AVX2: `PackedGoldilocksAVX2` (WIDTH=4) — has native `mulq` so SIMD IS beneficial +- For fair scalar comparison on x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` + +--- + +## 5. Plonky3 Parallelism + +- `p3-maybe-rayon` feature `parallel` is NOT enabled by default +- Without it, all `par_iter()` calls fall back to `core::iter` (sequential) +- `Radix2DitParallel` is "parallel" in name only without the feature +- Must explicitly enable: `p3-uni-stark = { version = "0.5.2", features = ["parallel"] }` + `p3-dft = ...` +- Verified via `cargo tree -e features | grep p3-maybe-rayon` + +--- + +## 6. 
Lambda Profiling Results (server, profile_prover, 2^20 × 16 cols) + +### Single-threaded (38.7s) +| Component | % | Category | +|-----------|---|----------| +| Constraint evaluation | 32.1% | Compute | +| Keccak hashing | 15.1% | Hashing | +| Deep composition poly | 14.0% | Compute | +| Merkle tree build | 12.0% | Hashing | +| Field multiplication | 11.1% | Compute | +| FFT | 10.5% | FFT | +| Other | 5.2% | | + +### Parallel (12 threads, 19.2s — 2.02× speedup) +| Metric | Value | +|--------|-------| +| Parallel efficiency | 16.8% of ideal 12× | +| CPU utilization | 30.6% | +| Main thread work | 13.3s | +| Worker thread work | ~5s each | +| New #1 bottleneck | Keccak (16.7%) | + +### Key profiling findings +- 100% CPU-bound (no memory/IO stalls) +- SIMD PackedGoldilocks types exist but are NOT used by prover +- Iterator overhead (Map::fold + FnMut): 7.6% +- Memory allocation overhead: 8.9% (page faults + malloc + cfree) +- Amdahl's Law: ~34% serial portion limits parallel speedup + +--- + +## 7. Optimizations Implemented (then stashed) + +### Item 2: Parallel FRI folding +- File: `crypto/stark/src/fri/fri_functions.rs` +- Change: `(0..half).into_par_iter().map().collect()` with `#[cfg(feature = "parallel")]` +- Also: `crypto/stark/src/fri/mod.rs` — added `Send + Sync` bounds +- Tests: 450/450 passed (121 stark + 326 VM + 3 bench) + +### Item 3: Quotient domain constraint evaluation +- File: `crypto/stark/src/constraints/evaluator.rs` — added `lde_stride: usize` parameter +- File: `crypto/stark/src/prover.rs` — when `number_of_parts == 1`, uses `lde_stride = blowup_factor` + then extends N evaluations to LDE via `interpolate_offset_fft + evaluate_polynomial_on_lde_domain` +- Tests: 450/450 passed +- Impact on M1: 2.09s → 2.02s (~3%, within Criterion noise) +- Impact limited because iFFT+FFT extension cost offsets constraint eval savings + +### Why stashed +User wants clean baseline first (fair comparison), then optimize. These changes are ready to re-apply. 
+ +--- + +## 8. Optimization Priority (from profiling data) + +### With parallel enabled (real-world scenario) + +| # | Optimization | Impact (parallel) | Effort | Status | +|---|-------------|-------------------|--------|--------| +| 1 | PR 492 (LDE cache) | 5-8% (reduces serial) | Done (PR open) | Waiting merge | +| 2 | BLAKE3 hash | ~12% (Keccak is parallel bottleneck) | Low | Not started | +| 3 | Quotient domain eval | 3-5% (constraint eval parallelized already) | Medium | Implemented, stashed | +| 4 | Reduce allocations | 5-8% | Medium | Not started | +| 5 | Parallel FRI fold | ~3% | Low | Implemented, stashed | +| 6 | Monomorphize constraints | 3-5% | High | Not started | + +### Plonky3 degree-3 extension (Option C) +- Would eliminate the last asymmetric variable in the comparison +- Requires implementing `BinomiallyExtendable<3>` for Goldilocks in vendored crate +- Need Sage computation for: `DTH_ROOT = 2^((p-1)/3)`, `EXT_GENERATOR` +- Expected: gap drops from 2.43× to ~1.5-1.7× (confirms extension degree accounts for ~40% of gap) + +--- + +## 9. How to Run + +### M1 / aarch64 (scalar comparison) +```bash +RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3 +``` + +### x86_64 server (scalar comparison, no AVX2) +```bash +cargo bench -p bench-vs-plonky3 +# or explicitly: RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ... +``` + +### With instruments (Lambda phase breakdown) +```bash +# Add "instruments" to stark features in bench_vs_plonky3/Cargo.toml first +cargo bench -p bench-vs-plonky3 --features stark/instruments +``` + +### Verify correctness +```bash +cargo test -p bench-vs-plonky3 # 3 tests +cargo test -p stark --lib # 121 tests +cargo test -p lambda-vm-prover # 326 tests +``` + +--- + +## 10. 
Key Files Reference + +| File | Purpose | +|------|---------| +| `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` | Lambda AIR (32 cols, 2-row window) | +| `bench_vs_plonky3/src/plonky3_fibonacci.rs` | Plonky3 AIR (matching) | +| `bench_vs_plonky3/src/plonky3_config.rs` | P3 config (FRI params matched) | +| `bench_vs_plonky3/benches/stark_comparison.rs` | Criterion benchmark | +| `bench_vs_plonky3/p3-goldilocks-patched/` | Patched p3-goldilocks (no NEON) | +| `crypto/stark/src/constraints/evaluator.rs` | Constraint eval loop (bottleneck) | +| `crypto/stark/src/prover.rs` | Prover pipeline (Round 1-4) | +| `crypto/stark/src/fri/fri_functions.rs` | FRI folding | +| `crypto/stark/src/domain.rs` | LDE domain definition | +| `crypto/math/src/fft/polynomial.rs` | FFT / coset_lde_full_expand | diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml new file mode 100644 index 000000000..a3d4e02e2 --- /dev/null +++ b/bench_vs_plonky3/Cargo.toml @@ -0,0 +1,56 @@ +[package] +name = "bench-vs-plonky3" +version = "0.1.0" +edition = "2024" + +[dependencies] +# Lambda STARK +stark = { path = "../crypto/stark", features = ["test-utils"] } +crypto = { path = "../crypto/crypto", features = ["std", "serde"] } +math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] } + +# Plonky3 (all 0.5.2) +p3-air = "0.5.2" +p3-field = "0.5.2" +p3-goldilocks = "0.5.2" +p3-matrix = "0.5.2" +p3-commit = "0.5.2" +p3-challenger = "0.5.2" +p3-symmetric = "0.5.2" +p3-merkle-tree = "0.5.2" +p3-keccak = "0.5.2" +p3-fri = "0.5.2" +p3-uni-stark = { version = "0.5.2", features = ["parallel"] } +p3-dft = { version = "0.5.2", features = ["parallel"] } + +# Tracing for P3 span-based profiling +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] } + +[dev-dependencies] +criterion = { version = "0.4", default-features = false } + +[features] +# Both provers run multi-threaded by default: Plonky3's `Radix2DitParallel` DFT +# uses rayon
because its `parallel` feature is enabled above, so Lambda must also enable `parallel` for a fair +# apples-to-apples comparison. Disable with `--no-default-features` to compare +# single-threaded. +# +# `p3-degree3` (default on) selects the cubic extension for Plonky3's +# Challenge type, matching Lambda's `Degree3GoldilocksExtensionField`. It +# requires the root `[patch.crates-io]` pointing at p3-goldilocks-patched. +# Disable it (`--no-default-features --features parallel`) together with +# commenting the patch block to build against vanilla crates.io +# p3-goldilocks (degree-2 extension). +default = ["parallel", "p3-degree3"] +parallel = ["stark/parallel"] +instruments = ["stark/instruments"] +p3-degree3 = [] + +[[bin]] +name = "prove_bench" +path = "src/bin/prove_bench.rs" + +[[bench]] +name = "stark_comparison" +harness = false diff --git a/bench_vs_plonky3/INSTRUMENTATION.md b/bench_vs_plonky3/INSTRUMENTATION.md new file mode 100644 index 000000000..0d82afe0e --- /dev/null +++ b/bench_vs_plonky3/INSTRUMENTATION.md @@ -0,0 +1,203 @@ +# `bench_vs_plonky3` — puntos de instrumentación + +Guía de referencia para revisores / handoff. Describe **dónde está cada timer +y qué mide** en la comparación Lambda STARK vs Plonky3. No describe el AIR +en sí (eso vive en `ANALYSIS_LOG.md`). + +## Cómo correrlo + +El test que imprime el breakdown se llama `instruments_breakdown`. Hay que +compilar con la feature `instruments` y pasar `--nocapture` porque la salida +va a stdout (si no, `cargo test` se la come).
+ +**M1 (100% scalar, fairest):** + +```bash +RUSTFLAGS="-C target-feature=-sha3" \ +cargo test -p bench-vs-plonky3 --features instruments --release -- \ + instruments_breakdown --nocapture +``` + +**x86 (Goldilocks scalar, SSE2 Keccak residual en P3):** + +```bash +RUSTFLAGS="-C target-feature=-avx2,-avx512f" \ +cargo test -p bench-vs-plonky3 --features instruments --release -- \ + instruments_breakdown --nocapture +``` + +## Entrada principal + +- Archivo: `bench_vs_plonky3/src/lib.rs` +- Función: `instruments_breakdown` (línea 82) +- AIR Fibonacci fijo: + - `num_sequences = 16` + - `rows = 1 << 18` (2^18) + - columns = 32 (2 por secuencia) + - `blowup_factor = 2` + - `fri_number_of_queries = 219` + - `grinding_factor = 0` + +El test hace dos pasadas independientes: + +1. Corre Lambda STARK con los timers internos del crate `stark` (feature + `instruments`). +2. Corre Plonky3 con un `tracing_subscriber` custom que captura spans. + +## Feature flags + +`bench_vs_plonky3/Cargo.toml` (líneas 33-40): + +```toml +[features] +default = ["parallel", "p3-degree3"] +parallel = ["stark/parallel"] +instruments = ["stark/instruments"] +``` + +`crypto/stark/Cargo.toml` (líneas 35-41): + +```toml +[features] +instruments = [] # prints de timing en prover/verifier +parallel = ["dep:rayon", "crypto/parallel"] +``` + +`instruments` y `parallel` **coexisten** (no son excluyentes). En la práctica +los benchmarks corren siempre con ambos activos: Plonky3 usa +`Radix2DitParallel` (rayon) unconditionally, así que Lambda también tiene que +correr en paralelo para comparar apples-to-apples. + +## Lambda: estructuras de timing + +`crypto/stark/src/instruments.rs`. + +### `MultiProveTiming` (líneas 40-50) + +Recolectada dentro de `multi_prove` y consumida por el test vía +`stark::instruments::take()`. + +| Campo | Qué mide | +|---|---| +| `prepass` | Construcción de domains + `LdeTwiddles` caches. | +| `main_commits` | Round 1 Phase A: commit de todos los main traces.
| +| `aux_build` | Round 1 Phase B: construcción de aux traces / LogUp. | +| `aux_commit` | Round 1 Phase B: LDE + Merkle commit de aux traces. | +| `rounds_2_4` | Tiempo total de Rounds 2-4 (todas las tablas). | +| `round1_sub` | Sub-op breakdown de Round 1 (`Round1SubOps`). | +| `table_timings` | Por tabla: `(name, rows, duration, TableSubOps)`. | + +### `Round1SubOps` (líneas 28-37) + +Sub-ops dentro de Round 1. Se acumulan en `AtomicU64`, así que workers rayon +las pueden incrementar en paralelo sin perder datos. + +| Campo | Qué mide | +|---|---| +| `main_lde` | Main trace: `expand_columns_to_lde` (LDE/FFT). | +| `main_merkle` | Main trace: `commit_columns_bit_reversed` (Merkle). | +| `aux_lde` | Aux trace: `expand_columns_to_lde`. | +| `aux_merkle` | Aux trace: `commit_columns_bit_reversed`. | + +### `TableSubOps` (líneas 7-24) + +Por tabla, dentro de Rounds 2-4. Las partes de R2/R4 se pasan por +thread-locals (`R2_SUB`, `R4_SUB`) y después se ensamblan en +`prove_rounds_2_to_4` (ver más abajo). + +| Campo | Round | Qué mide | +|---|---|---| +| `constraints` | R2 | `evaluator.evaluate()` — constraints sobre dominio LDE. | +| `comp_decompose` | R2 | `decompose_and_extend_d2` — iFFT + extensión del composition poly. | +| `comp_commit` | R2 | Merkle commit del composition poly. | +| `ood` | R3 | Barycentric OOD eval (ver nota sobre dónde se captura). | +| `deep_comp` | R4 | `compute_deep_composition_poly_evaluations`. | +| `deep_extend` | R4 | `interpolate_fft` + `evaluate_fft` para extender el deep comp poly. | +| `fri_commit` | R4 | `fri::commit_phase_from_evaluations` (folds + Merkle layers). | +| `queries` | R4 | Grinding (si hay) + sampling + FRI query phase + Merkle openings. | + +### Dónde se capturan (en `crypto/stark/src/prover.rs`) + +- `multi_prove` (línea 1490): + - `reset_all()` (1502). + - `prepass` timer (1515-1533). + - `main_commits` timer (1541-…). + - `aux_build`, `aux_commit` timers (durante Round 1 Phase B). 
+ - `rounds_2_4` timer; al final: `store(MultiProveTiming)`. +- `round_2_compute_composition_polynomial` — `constraints` / `comp_decompose` / + `comp_commit` (vía `store_r2_sub`). +- `prove_rounds_2_to_4` — **acá** se captura el OOD: + `round_3_dur = t_r3.elapsed()` en líneas 1957-1967, y se guarda en + `TableSubOps.ood` (línea 2010). `round_3_evaluate_polynomials_in_out_of_domain_element` + **no** tiene instrumentación propia. +- `round_4_compute_and_run_fri_on_the_deep_composition_polynomial` — + `deep_comp` / `deep_extend` / `fri_commit` / `queries` + (vía `store_r4_sub`). + +## Plonky3: breakdown por spans + +Todo vive dentro de `instruments_breakdown` en `bench_vs_plonky3/src/lib.rs`, +después del bloque de Lambda. + +- Se define una `P3TimingLayer` custom (líneas 216-259) que implementa + `tracing_subscriber::Layer`: + - `on_new_span` guarda el nombre del span. + - `on_enter` guarda `Instant::now()`. + - `on_close` calcula `start.elapsed()` y lo empuja a un `Vec<(name, ms)>`. +- Se monta un subscriber con `LevelFilter::DEBUG` (línea 266) y se instala + como default **sólo durante el `p3_uni_stark::prove`** (líneas 275-280, + scope con `_guard`). +- Post-prove: orden descendente por duración (287), filtra spans con + `ms >= 0.1` (289), y calcula `(unaccounted) = total − Σspans` (293-301). + +### Qué implica el diseño + +- **La capa no filtra por crate**: captura *cualquier* span DEBUG emitido + mientras el subscriber está vivo. En la práctica sólo corre + `p3_uni_stark::prove` dentro de ese bloque, así que todos los spans que + salen son de Plonky3 — pero si alguien agrega un `#[instrument]` propio + dentro del scope del guard, también se va a contar. +- **No hay instrumentación manual de funciones de Plonky3.** La granularidad + del breakdown = spans que Plonky3 ya emite internamente. +- **Nesting / doble-conteo:** P3 tiene spans anidados (p.ej. + `prove ⊃ compute_quotient_values ⊃ evaluate_constraints`). 
Cada span se + cuenta una vez con su wall-clock entre `on_enter` y `on_close`, así que + **`Σspans > wall-clock` es esperable, no es un bug**. Consecuencia: + `(unaccounted) = total − Σspans` **puede quedar negativo** en presencia de + nesting — no significa que falte tiempo, significa que los spans padre se + solapan con sus hijos. El código sólo imprime `(unaccounted)` si + `> 1.0ms`, así que casos negativos se silencian. + +## Segunda capa de instrumentación (no la usa `bench_vs_plonky3`) + +Existe una capa adicional en `prover/src/instruments.rs` (líneas 54-211, +`print_report`) — orientada al ejecutor del VM (execute + trace build + AIR +construction) que además re-imprime el `MultiProveTiming` del STARK con +otro formato. `bench_vs_plonky3` **no** la invoca; sólo consume +`stark::instruments::take()` directamente. Vale la pena saberlo si buscás +timings y aparecen en logs distintos. + +## Advertencias para el revisor + +1. Lambda: timing manual, específico del pipeline `multi_prove`. Granularidad + fina pero acoplada al código — moverlo rompe los breakpoints. +2. Plonky3: span-based. Granularidad = la que P3 decida exponer. Si P3 deja + de emitir un span en una versión futura, la línea desaparece del reporte + sin previo aviso. +3. Los porcentajes de Lambda se calculan contra el **total wall-clock del + test** (no contra `rounds_2_4`), así que la suma no cierra al 100% — hay + tiempo fuera de `multi_prove` (construcción de AIR, setup). +4. Los porcentajes de Plonky3 se calculan contra **`p3_prove_dur`** (solo el + `prove`, sin setup). +5. El benchmark usa **degree 3** para la extensión de Plonky3 *sólo* si el + root `Cargo.toml` mantiene: + ```toml + [patch.crates-io] + p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" } + ``` + (línea 26). Sin ese patch, P3 usa la extensión degree 2 de upstream y la + comparación deja de ser fair. +6. Plataforma: + - M1: `RUSTFLAGS="-C target-feature=-sha3"` → scalar en ambos lados. 
+ - x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` → Goldilocks scalar, + residual SSE2 en Keccak de P3 (~7%). diff --git a/bench_vs_plonky3/benches/stark_comparison.rs b/bench_vs_plonky3/benches/stark_comparison.rs new file mode 100644 index 000000000..fd90ae7b5 --- /dev/null +++ b/bench_vs_plonky3/benches/stark_comparison.rs @@ -0,0 +1,190 @@ +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use crypto::fiat_shamir::default_transcript::DefaultTranscript; +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use p3_uni_stark::{prove as p3_prove, verify as p3_verify}; +use stark::proof::options::ProofOptions; +use stark::prover::{IsStarkProver, Prover}; +use stark::verifier::{IsStarkVerifier, Verifier}; + +use bench_vs_plonky3::lambda_fibonacci_pair; +use bench_vs_plonky3::plonky3_config; +use bench_vs_plonky3::plonky3_fibonacci; + +type F = GoldilocksField; +type E = Degree3GoldilocksExtensionField; +type FE = FieldElement; + +/// Number of independent Fibonacci sequences. +const NUM_SEQUENCES: usize = 16; + +/// Rows (same for both Lambda and Plonky3 — identical AIR shape). +/// +/// 2^18 rows × 2 Fibonacci steps packed per row = 2^19 effective Fibonacci +/// steps per sequence, matching Lambda's original `FibonacciMultiColumnAIR` +/// at 2^19 rows × 1 step/row. +const ROWS: usize = 1 << 18; +const TRACE_LABEL: &str = "fib_pair_16seq_2^18"; + +/// Production proof options: blowup=2, 219 queries (from +/// `GoldilocksCubicProofOptions::with_blowup(2)`), grinding=0 (excluded +/// from benchmark — identical PoW work on both sides, not informative). 
+fn benchmark_proof_options() -> ProofOptions { + ProofOptions { + blowup_factor: 2, + fri_number_of_queries: 219, + coset_offset: 3, + grinding_factor: 0, + } +} + +fn lambda_initial_values() -> Vec<(FE, FE)> { + (0..NUM_SEQUENCES) + .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) + .collect() +} + +fn bench_lambda_prove(c: &mut Criterion) { + let mut group = c.benchmark_group("lambda_stark/prove"); + group.throughput(Throughput::Elements( + (ROWS * 2 * NUM_SEQUENCES) as u64, + )); + let proof_options = benchmark_proof_options(); + + group.bench_with_input( + BenchmarkId::new("fibonacci", TRACE_LABEL), + &ROWS, + |b, &rows| { + b.iter_with_setup( + || { + let initial_values = lambda_initial_values(); + let trace = lambda_fibonacci_pair::compute_trace::( + &initial_values, + rows, + ); + let pub_inputs = + lambda_fibonacci_pair::create_public_inputs(initial_values); + let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( + &proof_options, + NUM_SEQUENCES, + ); + (trace, pub_inputs, air) + }, + |(mut trace, pub_inputs, air)| { + Prover::::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .unwrap() + }, + ); + }, + ); + group.finish(); +} + +fn bench_plonky3_prove(c: &mut Criterion) { + let mut group = c.benchmark_group("plonky3_stark/prove"); + group.throughput(Throughput::Elements( + (ROWS * 2 * NUM_SEQUENCES) as u64, + )); + + group.bench_with_input( + BenchmarkId::new("fibonacci", TRACE_LABEL), + &ROWS, + |b, &rows| { + b.iter_with_setup( + || { + let config = plonky3_config::matched_params_config(); + let air = plonky3_fibonacci::P3FibonacciAir { + num_sequences: NUM_SEQUENCES, + }; + let trace = + plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, rows); + let pis = plonky3_fibonacci::public_values(NUM_SEQUENCES); + (config, air, trace, pis) + }, + |(config, air, trace, pis)| p3_prove(&config, &air, trace, &pis), + ); + }, + ); + group.finish(); +} + +fn 
bench_lambda_verify(c: &mut Criterion) { + let mut group = c.benchmark_group("lambda_stark/verify"); + group.throughput(Throughput::Elements( + (ROWS * 2 * NUM_SEQUENCES) as u64, + )); + let proof_options = benchmark_proof_options(); + + let initial_values = lambda_initial_values(); + let mut trace = lambda_fibonacci_pair::compute_trace::(&initial_values, ROWS); + let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); + let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( + &proof_options, + NUM_SEQUENCES, + ); + let proof = Prover::::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .unwrap(); + + group.bench_with_input(BenchmarkId::new("fibonacci", TRACE_LABEL), &ROWS, |b, _| { + b.iter(|| { + assert!(Verifier::::verify( + &proof, + &air, + &mut DefaultTranscript::::new(&[]), + )) + }); + }); + group.finish(); +} + +fn bench_plonky3_verify(c: &mut Criterion) { + let mut group = c.benchmark_group("plonky3_stark/verify"); + group.throughput(Throughput::Elements( + (ROWS * 2 * NUM_SEQUENCES) as u64, + )); + + let air = plonky3_fibonacci::P3FibonacciAir { + num_sequences: NUM_SEQUENCES, + }; + let trace = plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, ROWS); + let pis = plonky3_fibonacci::public_values(NUM_SEQUENCES); + let config = plonky3_config::matched_params_config(); + let proof = p3_prove(&config, &air, trace, &pis); + + group.bench_with_input(BenchmarkId::new("fibonacci", TRACE_LABEL), &ROWS, |b, _| { + b.iter(|| { + let config = plonky3_config::matched_params_config(); + p3_verify(&config, &air, &proof, &pis).unwrap(); + }); + }); + group.finish(); +} + +criterion_group! { + name = prove_comparison; + config = Criterion::default() + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(120)); + targets = bench_lambda_prove, bench_plonky3_prove +} + +criterion_group! 
{ + name = verify_comparison; + config = Criterion::default() + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); + targets = bench_lambda_verify, bench_plonky3_verify +} + +criterion_main!(prove_comparison, verify_comparison); diff --git a/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml b/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml new file mode 100644 index 000000000..768a2bb5a --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml @@ -0,0 +1,129 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2024" +name = "p3-goldilocks" +version = "0.5.2" +build = false +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "An implementation of the Goldilocks prime field F_p, where p = 2^64 - 2^32 + 1." 
+homepage = "https://github.com/Plonky3/Plonky3" +readme = false +keywords = [ + "cryptography", + "SNARK", + "PLONK", + "FRI", + "plonky3", +] +categories = ["cryptography::cryptocurrencies"] +license = "MIT OR Apache-2.0" +repository = "https://github.com/Plonky3/Plonky3" +resolver = "2" + +[lib] +name = "p3_goldilocks" +path = "src/lib.rs" + +[[bench]] +name = "bench_field" +path = "benches/bench_field.rs" +harness = false + +[[bench]] +name = "extension" +path = "benches/extension.rs" +harness = false + +[dependencies.num-bigint] +version = "0.4.6" +default-features = false + +[dependencies.p3-challenger] +version = "0.5.2" + +[dependencies.p3-dft] +version = "0.5.2" + +[dependencies.p3-field] +version = "0.5.2" + +[dependencies.p3-mds] +version = "0.5.2" + +[dependencies.p3-poseidon1] +version = "0.5.2" + +[dependencies.p3-poseidon2] +version = "0.5.2" + +[dependencies.p3-symmetric] +version = "0.5.2" + +[dependencies.p3-util] +version = "0.5.2" + +[dependencies.paste] +version = "1.0.15" + +[dependencies.rand] +version = "0.10.0" +default-features = false + +[dependencies.serde] +version = "1.0" +features = ["derive"] +default-features = false + +[dev-dependencies.criterion] +version = "0.8" + +[dev-dependencies.proptest] +version = "1.10" + +[dev-dependencies.rand] +version = "0.10.0" +default-features = false + +[lints.clippy] +cognitive_complexity = "allow" +match_bool = "warn" +needless_pass_by_value = "warn" +redundant_pub_crate = "allow" +semicolon_if_nothing_returned = "warn" +too_long_first_doc_paragraph = "allow" +transmute_undefined_repr = "allow" +tuple_array_conversions = "allow" +unused_peekable = "allow" + +[lints.clippy.all] +level = "warn" +priority = -1 + +[lints.clippy.nursery] +level = "warn" +priority = -1 + +[lints.rust] +rust_2024_incompatible_pat = "warn" +unused_must_use = "deny" + +[lints.rust.rust_2018_idioms] +level = "deny" +priority = -1 + +[lints.rustdoc] +all = "warn" diff --git 
a/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs new file mode 100644 index 000000000..a0d5e05f4 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs @@ -0,0 +1,72 @@ +use core::any::type_name; + +use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; +use p3_field::{Field, PrimeCharacteristicRing}; +use p3_field_testing::bench_func::{ + benchmark_add_latency, benchmark_add_throughput, benchmark_chunked_linear_combination, + benchmark_inv, benchmark_iter_sum, benchmark_sub_latency, benchmark_sub_throughput, +}; +use p3_field_testing::{ + benchmark_dot_array, benchmark_mul_latency, benchmark_mul_throughput, benchmark_sum_array, +}; +use p3_goldilocks::Goldilocks; +use rand::rngs::SmallRng; +use rand::{RngExt, SeedableRng}; + +type F = Goldilocks; + +fn bench_field(c: &mut Criterion) { + let name = "Goldilocks"; + const REPS: usize = 200; + benchmark_mul_latency::(c, name); + benchmark_mul_throughput::(c, name); + benchmark_inv::(c, name); + benchmark_iter_sum::(c, name); + benchmark_sum_array::(c, name); + + benchmark_dot_array::(c, name); + benchmark_dot_array::(c, name); + benchmark_dot_array::(c, name); + benchmark_dot_array::(c, name); + benchmark_dot_array::(c, name); + benchmark_dot_array::(c, name); + + // Note that each round of throughput has 10 operations + // So we should have 10 * more repetitions for latency tests. 
+ const L_REPS: usize = 10 * REPS; + benchmark_add_latency::(c, name); + benchmark_add_throughput::(c, name); + benchmark_sub_latency::(c, name); + benchmark_sub_throughput::(c, name); + + benchmark_chunked_linear_combination::(c, name); + + let mut rng = SmallRng::seed_from_u64(1); + c.bench_function("7th_root", |b| { + b.iter_batched( + || rng.random::(), + |x| x.exp_u64(10540996611094048183), + BatchSize::SmallInput, + ); + }); +} +fn bench_packedfield(c: &mut Criterion) { + let name = type_name::<::Packing>().to_string(); + // Note that each round of throughput has 10 operations + // So we should have 10 * more repetitions for latency tests. + const REPS: usize = 100; + const L_REPS: usize = 10 * REPS; + + benchmark_add_latency::<::Packing, L_REPS>(c, &name); + benchmark_add_throughput::<::Packing, REPS>(c, &name); + benchmark_sub_latency::<::Packing, L_REPS>(c, &name); + benchmark_sub_throughput::<::Packing, REPS>(c, &name); + benchmark_mul_latency::<::Packing, L_REPS>(c, &name); + benchmark_mul_throughput::<::Packing, REPS>(c, &name); + + type PF = ::Packing; + benchmark_chunked_linear_combination::(c, &name); +} + +criterion_group!(goldilocks_arithmetic, bench_field, bench_packedfield); +criterion_main!(goldilocks_arithmetic); diff --git a/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs new file mode 100644 index 000000000..f4bf7e750 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs @@ -0,0 +1,40 @@ +use criterion::{Criterion, criterion_group, criterion_main}; +use p3_field::extension::BinomialExtensionField; +use p3_field_testing::bench_func::{ + benchmark_inv, benchmark_mul_latency, benchmark_mul_throughput, benchmark_square, +}; +use p3_field_testing::benchmark_mul; +use p3_goldilocks::Goldilocks; + +type EF2 = BinomialExtensionField; +type EF5 = BinomialExtensionField; + +// Note that each round of throughput has 10 operations +// So we should have 10 
* more repetitions for latency tests. +const REPS: usize = 50; +const L_REPS: usize = 10 * REPS; + +fn bench_quadratic_extension(c: &mut Criterion) { + let name = "BinomialExtensionField"; + benchmark_square::(c, name); + benchmark_inv::(c, name); + benchmark_mul::(c, name); + benchmark_mul_throughput::(c, name); + benchmark_mul_latency::(c, name); +} + +fn bench_quintic_extension(c: &mut Criterion) { + let name = "BinomialExtensionField"; + benchmark_square::(c, name); + benchmark_inv::(c, name); + benchmark_mul::(c, name); + benchmark_mul_throughput::(c, name); + benchmark_mul_latency::(c, name); +} + +criterion_group!( + bench_goldilocks_ef, + bench_quadratic_extension, + bench_quintic_extension +); +criterion_main!(bench_goldilocks_ef); diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs new file mode 100644 index 000000000..9d4b410d3 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs @@ -0,0 +1,343 @@ +//! MDS permutation for Goldilocks on aarch64. + +use core::arch::aarch64::*; +use core::mem::transmute; + +use p3_mds::MdsPermutation; +use p3_symmetric::Permutation; + +use super::packing::PackedGoldilocksNeon; +use super::utils::{pack_lanes, unpack_lanes}; +use crate::{Goldilocks, MdsMatrixGoldilocks}; + +// --------------------------------------------------------------------------- +// Packed MdsMatrixGoldilocks (delegates to scalar Karatsuba per lane) +// --------------------------------------------------------------------------- + +/// Apply the scalar MDS to each lane of a packed NEON state independently. 
+#[inline] +fn mds_packed( + mds: &MdsMatrixGoldilocks, + input: &mut [PackedGoldilocksNeon; WIDTH], +) where + MdsMatrixGoldilocks: Permutation<[Goldilocks; WIDTH]>, +{ + let (mut lane0, mut lane1) = unpack_lanes(input); + unsafe { + mds.permute_mut(&mut *(&mut lane0 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH])); + mds.permute_mut(&mut *(&mut lane1 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH])); + } + pack_lanes(input, &lane0, &lane1); +} + +impl Permutation<[PackedGoldilocksNeon; 8]> for MdsMatrixGoldilocks { + fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 8]) { + mds_packed(self, input); + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksNeon; 12]> for MdsMatrixGoldilocks { + fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 12]) { + mds_packed(self, input); + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +// --------------------------------------------------------------------------- +// NEON-accelerated circulant MDS (16-bit chunk multiply-accumulate) +// --------------------------------------------------------------------------- + +/// Goldilocks identity: `2^64 ≡ 2^32 − 1 (mod P)`. +const EPSILON_U32: u32 = 0xffffffff; + +/// Reduce two accumulated 4×32-bit chunk vectors back to Goldilocks field +/// elements. Each `uint32x4_t` holds four 32-bit accumulators representing +/// the four 16-bit chunks of a Goldilocks element: +/// +/// ```text +/// elem = c[0] + c[1]·2¹⁶ + c[2]·2³² + c[3]·2⁴⁸ +/// ``` +/// +/// Returns two Goldilocks values packed in a `uint64x2_t`. +/// +/// Ported from plonky2. 
+#[inline(always)] +unsafe fn mds_reduce([cumul_a, cumul_b]: [uint32x4_t; 2]) -> uint64x2_t { + unsafe { + let mut lo = vreinterpretq_u64_u32(vuzp1q_u32(cumul_a, cumul_b)); + let mut hi = vreinterpretq_u64_u32(vuzp2q_u32(cumul_a, cumul_b)); + + hi = vsraq_n_u64::<16>(hi, lo); + lo = vsliq_n_u64::<16>(lo, hi); + + let top = { + let hi_u8 = vreinterpretq_u8_u64(hi); + let top_idx = + transmute::<[u8; 8], uint8x8_t>([0x06, 0x07, 0xff, 0xff, 0x0e, 0x0f, 0xff, 0xff]); + let top_u8 = vqtbl1_u8(hi_u8, top_idx); + vreinterpret_u32_u8(top_u8) + }; + + let adj_lo = vmlal_n_u32(lo, top, EPSILON_U32); + let wraparound_mask = vcgtq_u64(lo, adj_lo); + vsraq_n_u64::<32>(adj_lo, wraparound_mask) + } +} + +/// NEON-accelerated width-8 circulant MDS. +/// +/// Circulant first row: `[7, 1, 3, 8, 8, 3, 4, 9]` +/// (matches `MATRIX_CIRC_MDS_8_SML_ROW`). +#[inline(always)] +pub unsafe fn mds_neon_w8(state: &[u64; 8]) -> [u64; 8] { + unsafe { + const ROW: [u32; 8] = [7, 1, 3, 8, 8, 3, 4, 9]; + + const M: [[u32; 8]; 8] = { + let mut m = [[0u32; 8]; 8]; + let mut i = 0; + while i < 8 { + let mut j = 0; + while j < 8 { + m[i][j] = ROW[(j + 8 - i) % 8]; + j += 1; + } + i += 1; + } + m + }; + + let c: [uint32x4_t; 8] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i]))); + + let mut res = [0u64; 8]; + + let mut pair = 0; + while pair < 4 { + let i0 = 2 * pair; + let i1 = i0 + 1; + + let mut a0 = vdupq_n_u32(0); + let mut a1 = vdupq_n_u32(0); + + let mut j = 0; + while j < 8 { + a0 = vmlaq_n_u32(a0, c[j], M[i0][j]); + a1 = vmlaq_n_u32(a1, c[j], M[i1][j]); + j += 1; + } + + let r = mds_reduce([a0, a1]); + res[i0] = vgetq_lane_u64::<0>(r); + res[i1] = vgetq_lane_u64::<1>(r); + pair += 1; + } + + res + } +} + +/// NEON-accelerated width-12 circulant MDS. +/// +/// Circulant first row: `[1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]` +/// (matches `MATRIX_CIRC_MDS_12_SML_ROW`). 
+#[inline(always)] +pub unsafe fn mds_neon_w12(state: &[u64; 12]) -> [u64; 12] { + unsafe { + const ROW: [u32; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]; + + const M: [[u32; 12]; 12] = { + let mut m = [[0u32; 12]; 12]; + let mut i = 0; + while i < 12 { + let mut j = 0; + while j < 12 { + m[i][j] = ROW[(j + 12 - i) % 12]; + j += 1; + } + i += 1; + } + m + }; + + let c: [uint32x4_t; 12] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i]))); + + let mut res = [0u64; 12]; + + let mut pair = 0; + while pair < 6 { + let i0 = 2 * pair; + let i1 = i0 + 1; + + let mut a0 = vdupq_n_u32(0); + let mut a1 = vdupq_n_u32(0); + + let mut j = 0; + while j < 12 { + a0 = vmlaq_n_u32(a0, c[j], M[i0][j]); + a1 = vmlaq_n_u32(a1, c[j], M[i1][j]); + j += 1; + } + + let r = mds_reduce([a0, a1]); + res[i0] = vgetq_lane_u64::<0>(r); + res[i1] = vgetq_lane_u64::<1>(r); + pair += 1; + } + + res + } +} + +/// NEON-accelerated MDS wrapper for use with the generic Poseidon1. +/// +/// Zero-sized type that implements `Permutation<[Goldilocks; 8]>` and +/// `Permutation<[Goldilocks; 12]>` using the NEON chunk technique. Plugs +/// into `Poseidon1ExternalLayerGeneric` to accelerate full-round MDS while +/// keeping LLVM-optimized partial rounds from the generic Poseidon1. 
+#[derive(Clone, Debug, Default)] +pub struct MdsNeonGoldilocks; + +impl Permutation<[Goldilocks; 8]> for MdsNeonGoldilocks { + fn permute_mut(&self, state: &mut [Goldilocks; 8]) { + let raw = unsafe { &*(state as *const [Goldilocks; 8] as *const [u64; 8]) }; + let result = unsafe { mds_neon_w8(raw) }; + *unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) } = result; + } +} + +impl Permutation<[Goldilocks; 12]> for MdsNeonGoldilocks { + fn permute_mut(&self, state: &mut [Goldilocks; 12]) { + let raw = unsafe { &*(state as *const [Goldilocks; 12] as *const [u64; 12]) }; + let result = unsafe { mds_neon_w12(raw) }; + *unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) } = result; + } +} + +#[cfg(test)] +mod tests { + use p3_field::PrimeField64; + use p3_symmetric::Permutation; + use rand::rngs::SmallRng; + use rand::{RngExt, SeedableRng}; + + use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksNeon}; + + type F = Goldilocks; + + // -- Packed MdsMatrixGoldilocks tests -- + + macro_rules! 
test_neon_mds { + ($name:ident, $width:literal) => { + #[test] + fn $name() { + let mut rng = SmallRng::seed_from_u64(1); + let mds = MdsMatrixGoldilocks; + + let input: [Goldilocks; $width] = rng.random(); + let expected = mds.permute(input); + + let packed_input = input.map(Into::::into); + let packed_output = mds.permute(packed_input); + + let neon_output = packed_output.map(|x| x.0[0]); + assert_eq!(neon_output, expected); + } + }; + } + + test_neon_mds!(test_neon_mds_width_8, 8); + test_neon_mds!(test_neon_mds_width_12, 12); + + // -- NEON MDS correctness tests -- + + #[test] + fn test_mds_neon_w8_matches_karatsuba() { + let mds = MdsMatrixGoldilocks; + let mut rng = SmallRng::seed_from_u64(42); + + for _ in 0..100 { + let input: [F; 8] = rng.random(); + let expected = mds.permute(input); + + let raw: [u64; 8] = input.map(|x| x.as_canonical_u64()); + let result = unsafe { super::mds_neon_w8(&raw) }; + + for i in 0..8 { + assert_eq!( + F::new(result[i]).as_canonical_u64(), + expected[i].as_canonical_u64(), + "NEON MDS w8 mismatch at index {i}" + ); + } + } + } + + #[test] + fn test_mds_neon_w12_matches_karatsuba() { + let mds = MdsMatrixGoldilocks; + let mut rng = SmallRng::seed_from_u64(43); + + for _ in 0..100 { + let input: [F; 12] = rng.random(); + let expected = mds.permute(input); + + let raw: [u64; 12] = input.map(|x| x.as_canonical_u64()); + let result = unsafe { super::mds_neon_w12(&raw) }; + + for i in 0..12 { + assert_eq!( + F::new(result[i]).as_canonical_u64(), + expected[i].as_canonical_u64(), + "NEON MDS w12 mismatch at index {i}" + ); + } + } + } + + #[test] + fn test_mds_neon_boundary_w8() { + let mds = MdsMatrixGoldilocks; + let p_minus_1 = F::ORDER_U64 - 1; + + for &val in &[0u64, 1, p_minus_1] { + let input: [F; 8] = [F::new(val); 8]; + let expected = mds.permute(input); + + let raw = [val; 8]; + let result = unsafe { super::mds_neon_w8(&raw) }; + + for i in 0..8 { + assert_eq!( + F::new(result[i]).as_canonical_u64(), + 
expected[i].as_canonical_u64(), + "NEON MDS w8 boundary mismatch at index {i} for value {val}" + ); + } + } + } + + #[test] + fn test_mds_neon_boundary_w12() { + let mds = MdsMatrixGoldilocks; + let p_minus_1 = F::ORDER_U64 - 1; + + for &val in &[0u64, 1, p_minus_1] { + let input: [F; 12] = [F::new(val); 12]; + let expected = mds.permute(input); + + let raw = [val; 12]; + let result = unsafe { super::mds_neon_w12(&raw) }; + + for i in 0..12 { + assert_eq!( + F::new(result[i]).as_canonical_u64(), + expected[i].as_canonical_u64(), + "NEON MDS w12 boundary mismatch at index {i} for value {val}" + ); + } + } + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs new file mode 100644 index 000000000..82516a6cf --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs @@ -0,0 +1,12 @@ +mod mds; +mod packing; +mod poseidon1; +mod poseidon1_asm; +mod poseidon2; +mod poseidon2_asm; +mod utils; + +pub use mds::MdsNeonGoldilocks; +pub use packing::*; +pub use poseidon1::*; +pub use poseidon2::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs new file mode 100644 index 000000000..f393c3b65 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs @@ -0,0 +1,404 @@ +use alloc::vec::Vec; +use core::arch::aarch64::{ + uint64x2_t, vaddq_u64, vandq_u64, vbicq_u64, vcgtq_s64, vdupq_n_u64, veorq_u64, vgetq_lane_u64, + vreinterpretq_s64_u64, vsetq_lane_u64, vshrq_n_u64, vsubq_u64, +}; +use core::fmt::Debug; +use core::iter::{Product, Sum}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + +use p3_field::exponentiation::exp_10540996611094048183; +use p3_field::op_assign_macros::{ + impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, + 
impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, + ring_sum, +}; +use p3_field::{ + Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, + PermutationMonomial, PrimeCharacteristicRing, PrimeField64, +}; +use p3_util::reconstitute_from_base; +use rand::distr::{Distribution, StandardUniform}; +use rand::{Rng, RngExt}; + +use crate::{Goldilocks, P}; + +const WIDTH: usize = 2; + +/// Equal to `2^32 - 1 = 2^64 mod P`. +const EPSILON: u64 = Goldilocks::ORDER_U64.wrapping_neg(); + +/// Vectorized NEON implementation of `Goldilocks` arithmetic. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +#[repr(transparent)] +#[must_use] +pub struct PackedGoldilocksNeon(pub [Goldilocks; WIDTH]); + +impl PackedGoldilocksNeon { + #[inline] + #[must_use] + pub(crate) fn to_vector(self) -> uint64x2_t { + unsafe { transmute(self) } + } + + #[inline] + pub(crate) fn from_vector(vector: uint64x2_t) -> Self { + unsafe { transmute(vector) } + } + + #[inline] + const fn broadcast(value: Goldilocks) -> Self { + Self([value; WIDTH]) + } +} + +impl From for PackedGoldilocksNeon { + fn from(x: Goldilocks) -> Self { + Self::broadcast(x) + } +} + +impl Add for PackedGoldilocksNeon { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::from_vector(add(self.to_vector(), rhs.to_vector())) + } +} + +impl Sub for PackedGoldilocksNeon { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::from_vector(sub(self.to_vector(), rhs.to_vector())) + } +} + +impl Neg for PackedGoldilocksNeon { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::from_vector(neg(self.to_vector())) + } +} + +impl Mul for PackedGoldilocksNeon { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::from_vector(mul(self.to_vector(), rhs.to_vector())) + } +} + +impl_add_assign!(PackedGoldilocksNeon); +impl_sub_assign!(PackedGoldilocksNeon); 
+impl_mul_methods!(PackedGoldilocksNeon); +ring_sum!(PackedGoldilocksNeon); +impl_rng!(PackedGoldilocksNeon); + +impl PrimeCharacteristicRing for PackedGoldilocksNeon { + type PrimeSubfield = Goldilocks; + + const ZERO: Self = Self::broadcast(Goldilocks::ZERO); + const ONE: Self = Self::broadcast(Goldilocks::ONE); + const TWO: Self = Self::broadcast(Goldilocks::TWO); + const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE); + + #[inline] + fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { + f.into() + } + + #[inline] + fn halve(&self) -> Self { + Self::from_vector(halve(self.to_vector())) + } + + #[inline] + fn square(&self) -> Self { + Self::from_vector(square(self.to_vector())) + } + + #[inline] + fn zero_vec(len: usize) -> Vec { + unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) } + } +} + +impl InjectiveMonomial<7> for PackedGoldilocksNeon {} + +impl PermutationMonomial<7> for PackedGoldilocksNeon { + fn injective_exp_root_n(&self) -> Self { + exp_10540996611094048183(*self) + } +} + +impl_add_base_field!(PackedGoldilocksNeon, Goldilocks); +impl_sub_base_field!(PackedGoldilocksNeon, Goldilocks); +impl_mul_base_field!(PackedGoldilocksNeon, Goldilocks); +impl_div_methods!(PackedGoldilocksNeon, Goldilocks); +impl_sum_prod_base_field!(PackedGoldilocksNeon, Goldilocks); + +impl Algebra for PackedGoldilocksNeon { + // Benchmarked on AArch64 NEON: chunk=2 ≈ 182ns, chunk=4 ≈ 198ns, chunk=8 ≈ 221ns. + const BATCHED_LC_CHUNK: usize = 2; +} + +impl_packed_value!(PackedGoldilocksNeon, Goldilocks, WIDTH); + +unsafe impl PackedField for PackedGoldilocksNeon { + type Scalar = Goldilocks; +} + +/// Interleave two 64-bit vectors at the element level. 
+/// For block_len=1: [a0, a1] x [b0, b1] -> [a0, b0], [a1, b1] +#[inline] +pub fn interleave_u64(v0: uint64x2_t, v1: uint64x2_t) -> (uint64x2_t, uint64x2_t) { + unsafe { + let a0 = vgetq_lane_u64::<0>(v0); + let a1 = vgetq_lane_u64::<1>(v0); + let b0 = vgetq_lane_u64::<0>(v1); + let b1 = vgetq_lane_u64::<1>(v1); + + // r0 = [a0, b0], r1 = [a1, b1] + let r0 = vsetq_lane_u64::<1>(b0, vsetq_lane_u64::<0>(a0, vdupq_n_u64(0))); + let r1 = vsetq_lane_u64::<1>(b1, vsetq_lane_u64::<0>(a1, vdupq_n_u64(0))); + + (r0, r1) + } +} + +unsafe impl PackedFieldPow2 for PackedGoldilocksNeon { + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + let (v0, v1) = (self.to_vector(), other.to_vector()); + let (res0, res1) = match block_len { + 1 => interleave_u64(v0, v1), + 2 => (v0, v1), + _ => panic!("unsupported block length"), + }; + (Self::from_vector(res0), Self::from_vector(res1)) + } +} + +// NEON arithmetic uses shifted representation (XOR with 2^63) for unsigned comparison. + +const SIGN_BIT: uint64x2_t = unsafe { transmute([i64::MIN as u64; WIDTH]) }; +const SHIFTED_FIELD_ORDER: uint64x2_t = + unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) }; +const EPSILON_VEC: uint64x2_t = unsafe { transmute([EPSILON; WIDTH]) }; + +#[inline(always)] +fn shift(x: uint64x2_t) -> uint64x2_t { + unsafe { veorq_u64(x, SIGN_BIT) } +} + +#[inline(always)] +unsafe fn canonicalize_s(x_s: uint64x2_t) -> uint64x2_t { + unsafe { + let x_s_signed = vreinterpretq_s64_u64(x_s); + let order_s_signed = vreinterpretq_s64_u64(SHIFTED_FIELD_ORDER); + let mask = vcgtq_s64(order_s_signed, x_s_signed); + let wrapback_amt = vbicq_u64(EPSILON_VEC, mask); + vaddq_u64(x_s, wrapback_amt) + } +} + +#[inline(always)] +unsafe fn add_no_double_overflow_64_64s_s(x: uint64x2_t, y_s: uint64x2_t) -> uint64x2_t { + unsafe { + let res_wrapped_s = vaddq_u64(x, y_s); + // After XOR shift, signed comparison correctly detects overflow. 
+ // Overflow occurred iff y_s > res_wrapped_s (as signed, due to shift semantics) + let y_s_signed = vreinterpretq_s64_u64(y_s); + let res_s_signed = vreinterpretq_s64_u64(res_wrapped_s); + let mask = vcgtq_s64(y_s_signed, res_s_signed); + // wrapback_amt is EPSILON on overflow + let wrapback_amt = vshrq_n_u64::<32>(mask); + vaddq_u64(res_wrapped_s, wrapback_amt) + } +} + +/// Goldilocks modular addition. +#[inline] +fn add(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t { + unsafe { + let y_s = shift(y); + let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); + shift(res_s) + } +} + +/// Goldilocks modular subtraction. +#[inline] +fn sub(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t { + unsafe { + let mut y_s = shift(y); + y_s = canonicalize_s(y_s); + let x_s = shift(x); + let y_s_signed = vreinterpretq_s64_u64(y_s); + let x_s_signed = vreinterpretq_s64_u64(x_s); + // -1 if underflow (y > x) + let mask = vcgtq_s64(y_s_signed, x_s_signed); + let wrapback_amt = vshrq_n_u64::<32>(mask); + let res_wrapped = vsubq_u64(x_s, y_s); + vsubq_u64(res_wrapped, wrapback_amt) + } +} + +/// Goldilocks modular negation. +#[inline] +fn neg(y: uint64x2_t) -> uint64x2_t { + unsafe { + let y_s = shift(y); + vsubq_u64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) + } +} + +/// Halve a vector of Goldilocks field elements. +#[inline(always)] +pub(crate) fn halve(input: uint64x2_t) -> uint64x2_t { + unsafe { + let one = vdupq_n_u64(1); + let zero = vdupq_n_u64(0); + let half = vdupq_n_u64(P.div_ceil(2)); + + let least_bit = vandq_u64(input, one); + let t = vshrq_n_u64::<1>(input); + // neg_least_bit is 0 or -1 (all bits 1) + let neg_least_bit = vsubq_u64(zero, least_bit); + let maybe_half = vandq_u64(half, neg_least_bit); + vaddq_u64(t, maybe_half) + } +} + +/// Goldilocks modular multiplication using interleaved dual-lane ASM. 
+#[inline] +fn mul(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t { + unsafe { + let x0 = vgetq_lane_u64::<0>(x); + let x1 = vgetq_lane_u64::<1>(x); + let y0 = vgetq_lane_u64::<0>(y); + let y1 = vgetq_lane_u64::<1>(y); + + let (res_0, res_1) = mul_reduce_dual_asm(x0, y0, x1, y1); + + transmute([res_0, res_1]) + } +} + +/// Interleaved dual-lane multiplication and reduction using scalar ASM. +/// Uses shift-based EPSILON multiplication: hi_lo * EPSILON = (hi_lo << 32) - hi_lo +#[inline(always)] +unsafe fn mul_reduce_dual_asm(a0: u64, b0: u64, a1: u64, b1: u64) -> (u64, u64) { + use core::arch::asm; + let result0: u64; + let result1: u64; + + unsafe { + asm!( + // Compute both 128-bit products (interleaved for ILP) + "mul {lo0}, {a0}, {b0}", + "mul {lo1}, {a1}, {b1}", + "umulh {hi0}, {a0}, {b0}", + "umulh {hi1}, {a1}, {b1}", + + // hi_hi = hi >> 32 + "lsr {hi_hi0}, {hi0}, #32", + "lsr {hi_hi1}, {hi1}, #32", + + // tmp = lo - hi_hi (with borrow handling) + "subs {tmp0}, {lo0}, {hi_hi0}", + "csetm {adj0:w}, cc", + "subs {tmp1}, {lo1}, {hi_hi1}", + "csetm {adj1:w}, cc", + "sub {tmp0}, {tmp0}, {adj0}", + "sub {tmp1}, {tmp1}, {adj1}", + + // hi_lo = hi & EPSILON + "and {hi_lo0}, {hi0}, {epsilon}", + "and {hi_lo1}, {hi1}, {epsilon}", + + // hi_lo_eps = (hi_lo << 32) - hi_lo (avoids multiply) + "lsl {t0}, {hi_lo0}, #32", + "lsl {t1}, {hi_lo1}, #32", + "sub {hi_lo_eps0}, {t0}, {hi_lo0}", + "sub {hi_lo_eps1}, {t1}, {hi_lo1}", + + // result = tmp + hi_lo_eps (with overflow handling) + "adds {result0}, {tmp0}, {hi_lo_eps0}", + "csetm {adj0:w}, cs", + "adds {result1}, {tmp1}, {hi_lo_eps1}", + "csetm {adj1:w}, cs", + "add {result0}, {result0}, {adj0}", + "add {result1}, {result1}, {adj1}", + + a0 = in(reg) a0, + b0 = in(reg) b0, + a1 = in(reg) a1, + b1 = in(reg) b1, + epsilon = in(reg) EPSILON, + lo0 = out(reg) _, + lo1 = out(reg) _, + hi0 = out(reg) _, + hi1 = out(reg) _, + hi_hi0 = out(reg) _, + hi_hi1 = out(reg) _, + tmp0 = out(reg) _, + tmp1 = out(reg) _, + hi_lo0 = out(reg) _, 
+ hi_lo1 = out(reg) _, + t0 = out(reg) _, + t1 = out(reg) _, + hi_lo_eps0 = out(reg) _, + hi_lo_eps1 = out(reg) _, + adj0 = out(reg) _, + adj1 = out(reg) _, + result0 = out(reg) result0, + result1 = out(reg) result1, + options(pure, nomem, nostack), + ); + } + + (result0, result1) +} + +/// Goldilocks modular square using interleaved dual-lane ASM. +#[inline] +fn square(x: uint64x2_t) -> uint64x2_t { + unsafe { + let x0 = vgetq_lane_u64::<0>(x); + let x1 = vgetq_lane_u64::<1>(x); + + let (res_0, res_1) = mul_reduce_dual_asm(x0, x0, x1, x1); + + transmute([res_0, res_1]) + } +} + +#[cfg(test)] +mod tests { + use p3_field_testing::test_packed_field; + + use super::{Goldilocks, PackedGoldilocksNeon, WIDTH}; + + const SPECIAL_VALS: [Goldilocks; WIDTH] = + Goldilocks::new_array([0xFFFF_FFFF_0000_0000, 0xFFFF_FFFF_FFFF_FFFF]); + + const ZEROS: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([ + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, // = P, canonicalizes to 0 + ])); + + const ONES: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([ + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, // = P + 1, canonicalizes to 1 + ])); + + test_packed_field!( + crate::PackedGoldilocksNeon, + &[super::ZEROS], + &[super::ONES], + crate::PackedGoldilocksNeon(super::SPECIAL_VALS) + ); +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs new file mode 100644 index 000000000..0a877578a --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs @@ -0,0 +1,716 @@ +//! Fused Poseidon1 permutation for Goldilocks on aarch64. 
+ +use alloc::vec::Vec; + +use p3_poseidon1::{ + FullRoundConstants, PartialRoundConstants, full_round_initial_permute_state, + full_round_terminal_permute_state, partial_permute_state, +}; +use p3_symmetric::{CryptographicPermutation, Permutation}; + +use super::mds::{MdsNeonGoldilocks, mds_neon_w8, mds_neon_w12}; +use super::packing::PackedGoldilocksNeon; +use super::poseidon1_asm::*; +use super::poseidon2_asm::{sbox_layer_asm, sbox_layer_dual_asm}; +use super::utils::{pack_lanes, unpack_lanes}; +use crate::Goldilocks; + +/// Fused Poseidon1 permutation for Goldilocks. +/// +/// Holds the pre-extracted raw `u64` constants from the optimized Poseidon1 +/// sparse-matrix decomposition. Storing raw values avoids field-element +/// overhead in the hot inner loop. +#[derive(Clone, Debug)] +pub struct Poseidon1GoldilocksFused { + /// Round constants for the initial full rounds (RF/2 vectors). + initial_constants_raw: Vec<[u64; WIDTH]>, + /// Round constants for the terminal full rounds (RF/2 vectors). + terminal_constants_raw: Vec<[u64; WIDTH]>, + /// Full-width constant vector for the first partial round. + first_round_constants_raw: [u64; WIDTH], + /// Dense transition matrix applied once before entering the partial-round loop. + m_i_raw: [[u64; WIDTH]; WIDTH], + /// Per-round first row of the sparse matrix (one per partial round). + sparse_first_row_raw: Vec<[u64; WIDTH]>, + /// Per-round sub-diagonal vector for the sparse matmul (one per partial round). + v_raw: Vec<[u64; WIDTH]>, + /// Scalar round constants for partial rounds 0 through RP-2. + /// + /// The last partial round has no scalar constant (it ends with the S-box only). + round_constants_raw: Vec, +} + +impl Poseidon1GoldilocksFused { + /// Create from pre-computed full and partial round constants. + /// + /// Extracts the raw `u64` representation from each Goldilocks field + /// element, building the flat arrays that the ASM kernels consume. 
+ pub fn new( + full: &FullRoundConstants, + partial: &PartialRoundConstants, + ) -> Self { + // Extract raw u64 values from full-round constant matrices. + let initial_constants_raw = full + .initial + .iter() + .map(|rc| core::array::from_fn(|i| rc[i].value)) + .collect(); + let terminal_constants_raw = full + .terminal + .iter() + .map(|rc| core::array::from_fn(|i| rc[i].value)) + .collect(); + + // Extract the first partial-round constant vector. + let first_round_constants_raw = + core::array::from_fn(|i| partial.first_round_constants[i].value); + + // Extract the dense transition matrix. + let m_i_raw = core::array::from_fn(|i| core::array::from_fn(|j| partial.m_i[i][j].value)); + + // Extract per-round sparse matrix data. + let sparse_first_row_raw = partial + .sparse_first_row + .iter() + .map(|r| core::array::from_fn(|i| r[i].value)) + .collect(); + let v_raw = partial + .v + .iter() + .map(|r| core::array::from_fn(|i| r[i].value)) + .collect(); + + // Extract scalar round constants for partial rounds. + let round_constants_raw = partial.round_constants.iter().map(|c| c.value).collect(); + + Self { + initial_constants_raw, + terminal_constants_raw, + first_round_constants_raw, + m_i_raw, + sparse_first_row_raw, + v_raw, + round_constants_raw, + } + } +} + +/// Run the initial or terminal full rounds on a raw width-8 state. +/// +/// Each full round applies: add constants, S-box on all elements, NEON MDS. +#[inline] +fn full_rounds_scalar_w8(raw: &mut [u64; 8], constants: &[[u64; 8]]) { + for rc in constants { + unsafe { + add_rc_asm(raw, rc); + sbox_layer_asm(raw); + } + *raw = unsafe { mds_neon_w8(raw) }; + } +} + +/// Run the initial or terminal full rounds on a raw width-12 state. +/// +/// Each full round applies: add constants, S-box on all elements, NEON MDS. 
+#[inline] +fn full_rounds_scalar_w12(raw: &mut [u64; 12], constants: &[[u64; 12]]) { + for rc in constants { + unsafe { + add_rc_asm(raw, rc); + sbox_layer_asm(raw); + } + *raw = unsafe { mds_neon_w12(raw) }; + } +} + +/// Run all partial rounds on a raw width-8 state. +/// +/// The partial-round sequence is: +/// 1. Add the first-round full-width constant vector. +/// 2. Apply the dense transition matrix once. +/// 3. For each partial round (except the last): +/// S-box on first element, add scalar constant, sparse matmul. +/// 4. Last partial round: S-box on first element, sparse matmul (no constant). +#[inline] +fn partial_rounds_scalar_w8( + raw: &mut [u64; 8], + first_rc: &[u64; 8], + m_i: &[[u64; 8]; 8], + sparse_first_row: &[[u64; 8]], + v: &[[u64; 8]], + round_constants: &[u64], +) { + // Add the first-round full-width constant vector. + unsafe { + add_rc_asm(raw, first_rc); + } + + // Apply the dense transition matrix once. + dense_matmul_asm_w8(raw, m_i); + + // Main partial-round loop: S-box + scalar constant + sparse matmul. + let rounds_p = sparse_first_row.len(); + for r in 0..rounds_p - 1 { + unsafe { + sbox_s0_asm(raw); + add_scalar_s0_asm(raw, round_constants[r]); + cheap_matmul_asm_w8(raw, &sparse_first_row[r], &v[r]); + } + } + + // Last partial round: no scalar constant. + unsafe { + sbox_s0_asm(raw); + cheap_matmul_asm_w8(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]); + } +} + +/// Run all partial rounds on a raw width-12 state. +/// +/// Same structure as the width-8 variant. 
+#[inline] +fn partial_rounds_scalar_w12( + raw: &mut [u64; 12], + first_rc: &[u64; 12], + m_i: &[[u64; 12]; 12], + sparse_first_row: &[[u64; 12]], + v: &[[u64; 12]], + round_constants: &[u64], +) { + unsafe { + add_rc_asm(raw, first_rc); + } + dense_matmul_asm_w12(raw, m_i); + + let rounds_p = sparse_first_row.len(); + for r in 0..rounds_p - 1 { + unsafe { + sbox_s0_asm(raw); + add_scalar_s0_asm(raw, round_constants[r]); + cheap_matmul_asm_w12(raw, &sparse_first_row[r], &v[r]); + } + } + unsafe { + sbox_s0_asm(raw); + cheap_matmul_asm_w12(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]); + } +} + +/// Run the initial or terminal full rounds on two raw width-8 lanes. +/// +/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane. +#[inline] +fn full_rounds_dual_w8(lane0: &mut [u64; 8], lane1: &mut [u64; 8], constants: &[[u64; 8]]) { + for rc in constants { + unsafe { + add_rc_dual_asm(lane0, lane1, rc); + sbox_layer_dual_asm(lane0, lane1); + } + *lane0 = unsafe { mds_neon_w8(lane0) }; + *lane1 = unsafe { mds_neon_w8(lane1) }; + } +} + +/// Run the initial or terminal full rounds on two raw width-12 lanes. +/// +/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane. +#[inline] +fn full_rounds_dual_w12(lane0: &mut [u64; 12], lane1: &mut [u64; 12], constants: &[[u64; 12]]) { + for rc in constants { + unsafe { + add_rc_dual_asm(lane0, lane1, rc); + sbox_layer_dual_asm(lane0, lane1); + } + *lane0 = unsafe { mds_neon_w12(lane0) }; + *lane1 = unsafe { mds_neon_w12(lane1) }; + } +} + +/// Run all partial rounds on two width-8 lanes simultaneously. +/// +/// Uses dual-lane S-box and sparse matmul primitives to keep the +/// pipeline full. The scalar constant is added to each lane separately +/// (no dual variant needed for a single-element addition). 
+#[inline] +fn partial_rounds_dual_w8( + lane0: &mut [u64; 8], + lane1: &mut [u64; 8], + first_rc: &[u64; 8], + m_i: &[[u64; 8]; 8], + sparse_first_row: &[[u64; 8]], + v: &[[u64; 8]], + round_constants: &[u64], +) { + // Add the first-round constant to both lanes. + unsafe { + add_rc_dual_asm(lane0, lane1, first_rc); + } + + // Dense transition matrix on both lanes. + dense_matmul_dual_asm_w8(lane0, lane1, m_i); + + // Main partial-round loop. + let rounds_p = sparse_first_row.len(); + for r in 0..rounds_p - 1 { + unsafe { + sbox_s0_dual_asm(lane0, lane1); + add_scalar_s0_asm(lane0, round_constants[r]); + add_scalar_s0_asm(lane1, round_constants[r]); + cheap_matmul_dual_asm_w8(lane0, lane1, &sparse_first_row[r], &v[r]); + } + } + + // Last partial round: no scalar constant. + unsafe { + sbox_s0_dual_asm(lane0, lane1); + cheap_matmul_dual_asm_w8( + lane0, + lane1, + &sparse_first_row[rounds_p - 1], + &v[rounds_p - 1], + ); + } +} + +/// Run all partial rounds on two width-12 lanes simultaneously. +/// +/// Same structure as the width-8 dual variant. 
+#[inline] +fn partial_rounds_dual_w12( + lane0: &mut [u64; 12], + lane1: &mut [u64; 12], + first_rc: &[u64; 12], + m_i: &[[u64; 12]; 12], + sparse_first_row: &[[u64; 12]], + v: &[[u64; 12]], + round_constants: &[u64], +) { + unsafe { + add_rc_dual_asm(lane0, lane1, first_rc); + } + dense_matmul_dual_asm_w12(lane0, lane1, m_i); + + let rounds_p = sparse_first_row.len(); + for r in 0..rounds_p - 1 { + unsafe { + sbox_s0_dual_asm(lane0, lane1); + add_scalar_s0_asm(lane0, round_constants[r]); + add_scalar_s0_asm(lane1, round_constants[r]); + cheap_matmul_dual_asm_w12(lane0, lane1, &sparse_first_row[r], &v[r]); + } + } + unsafe { + sbox_s0_dual_asm(lane0, lane1); + cheap_matmul_dual_asm_w12( + lane0, + lane1, + &sparse_first_row[rounds_p - 1], + &v[rounds_p - 1], + ); + } +} + +impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> { + fn permute_mut(&self, state: &mut [Goldilocks; 8]) { + // Zero-cost transmute: Goldilocks is repr(transparent) over u64. + let raw = unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; + + // Initial full rounds, then partial rounds, then terminal full rounds. + full_rounds_scalar_w8(raw, &self.initial_constants_raw); + partial_rounds_scalar_w8( + raw, + &self.first_round_constants_raw, + &self.m_i_raw, + &self.sparse_first_row_raw, + &self.v_raw, + &self.round_constants_raw, + ); + full_rounds_scalar_w8(raw, &self.terminal_constants_raw); + } +} + +impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> {} + +impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) { + // Unpack the two lanes from the packed representation. + let (mut lane0, mut lane1) = unpack_lanes(state); + + // Run the full permutation on both lanes simultaneously. 
+ full_rounds_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw); + partial_rounds_dual_w8( + &mut lane0, + &mut lane1, + &self.first_round_constants_raw, + &self.m_i_raw, + &self.sparse_first_row_raw, + &self.v_raw, + &self.round_constants_raw, + ); + full_rounds_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw); + + // Repack both lanes into the packed representation. + pack_lanes(state, &lane0, &lane1); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> {} + +impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> { + fn permute_mut(&self, state: &mut [Goldilocks; 12]) { + let raw = unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; + + full_rounds_scalar_w12(raw, &self.initial_constants_raw); + partial_rounds_scalar_w12( + raw, + &self.first_round_constants_raw, + &self.m_i_raw, + &self.sparse_first_row_raw, + &self.v_raw, + &self.round_constants_raw, + ); + full_rounds_scalar_w12(raw, &self.terminal_constants_raw); + } +} + +impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> {} + +impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + + full_rounds_dual_w12(&mut lane0, &mut lane1, &self.initial_constants_raw); + partial_rounds_dual_w12( + &mut lane0, + &mut lane1, + &self.first_round_constants_raw, + &self.m_i_raw, + &self.sparse_first_row_raw, + &self.v_raw, + &self.round_constants_raw, + ); + full_rounds_dual_w12(&mut lane0, &mut lane1, &self.terminal_constants_raw); + + pack_lanes(state, &lane0, &lane1); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> {} + +/// Dual-dispatch wrapper for Goldilocks Poseidon1. 
+/// +/// **Scalar** permutations use the NEON-accelerated MDS for full rounds +/// and LLVM-optimized sparse matrix decomposition for partial rounds. +/// This avoids sequential inline ASM that would prevent LLVM's +/// instruction scheduling optimizations on wide out-of-order cores. +/// +/// **Packed** permutations delegate to the fused dual-lane ASM path +/// with NEON MDS for full rounds and sparse matrix for partial rounds +/// (dual-lane interleaving hides multiply latency). +#[derive(Clone, Debug)] +pub struct Poseidon1GoldilocksDispatch { + /// Fused dual-lane path — used for packed permutations. + fused: Poseidon1GoldilocksFused, + /// Pre-computed full round constants for NEON MDS. + full_constants: FullRoundConstants, + /// Pre-computed partial round constants (textbook path for scalar, sparse for packed). + partial_constants: PartialRoundConstants, +} + +impl Poseidon1GoldilocksDispatch { + /// Create from fused and pre-computed constants. + pub const fn new( + fused: Poseidon1GoldilocksFused, + full_constants: FullRoundConstants, + partial_constants: PartialRoundConstants, + ) -> Self { + Self { + fused, + full_constants, + partial_constants, + } + } +} + +// --- Width 8 --- + +impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> { + fn permute_mut(&self, state: &mut [Goldilocks; 8]) { + let mds = MdsNeonGoldilocks; + full_round_initial_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds); + partial_permute_state::<_, _, 8, 7>(state, &self.partial_constants); + full_round_terminal_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds); + } +} + +impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> {} + +impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) { + self.fused.permute_mut(state); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> {} + 
+// --- Width 12 --- + +impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> { + fn permute_mut(&self, state: &mut [Goldilocks; 12]) { + let mds = MdsNeonGoldilocks; + full_round_initial_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds); + partial_permute_state::<_, _, 12, 7>(state, &self.partial_constants); + full_round_terminal_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds); + } +} + +impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> {} + +impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) { + // Extract both lanes, run the optimized scalar path on each, repack. + // Directly inline the scalar logic (NEON MDS full rounds + sparse partial + // rounds) to avoid trait-dispatch overhead and enable cross-call inlining. + let mut lane0: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[0]); + let mut lane1: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[1]); + + let mds = MdsNeonGoldilocks; + full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds); + partial_permute_state::<_, _, 12, 7>(&mut lane0, &self.partial_constants); + full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds); + + full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds); + partial_permute_state::<_, _, 12, 7>(&mut lane1, &self.partial_constants); + full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds); + + for i in 0..12 { + state[i] = PackedGoldilocksNeon([lane0[i], lane1[i]]); + } + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> {} + +#[cfg(test)] +mod tests { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + use p3_poseidon1::Poseidon1Constants; + use p3_symmetric::Permutation; + use 
rand::rngs::SmallRng; + use rand::{RngExt, SeedableRng}; + + use super::*; + use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL}; + use crate::poseidon1::{ + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, GOLDILOCKS_POSEIDON1_RC_8, + GOLDILOCKS_POSEIDON1_RC_12, default_goldilocks_poseidon1_8, + default_goldilocks_poseidon1_12, + }; + + type F = Goldilocks; + + /// Build a width-8 fused permutation from the fixed round constants. + fn make_fused_w8() -> Poseidon1GoldilocksFused<8> { + let raw = Poseidon1Constants { + rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + mds_circ_col: MATRIX_CIRC_MDS_8_COL, + round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(), + }; + let (full, partial) = raw.to_optimized(); + Poseidon1GoldilocksFused::new(&full, &partial) + } + + /// Build a width-12 fused permutation from the fixed round constants. + fn make_fused_w12() -> Poseidon1GoldilocksFused<12> { + let raw = Poseidon1Constants { + rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, + mds_circ_col: MATRIX_CIRC_MDS_12_COL, + round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(), + }; + let (full, partial) = raw.to_optimized(); + Poseidon1GoldilocksFused::new(&full, &partial) + } + + /// Verify that the fused width-8 implementation matches the generic one + /// on both zero and random inputs. + #[test] + fn test_fused_matches_generic_w8() { + let generic = default_goldilocks_poseidon1_8(); + let fused = make_fused_w8(); + let mut rng = SmallRng::seed_from_u64(42); + + // Zero input. 
+ let mut g_state = [F::ZERO; 8]; + let mut f_state = [F::ZERO; 8]; + generic.permute_mut(&mut g_state); + fused.permute_mut(&mut f_state); + for i in 0..8 { + assert_eq!( + f_state[i].as_canonical_u64(), + g_state[i].as_canonical_u64(), + "Fused vs generic mismatch at index {i} (zero input, w8)" + ); + } + + // Random input. + let mut g_state: [F; 8] = rng.random(); + let mut f_state = g_state; + generic.permute_mut(&mut g_state); + fused.permute_mut(&mut f_state); + for i in 0..8 { + assert_eq!( + f_state[i].as_canonical_u64(), + g_state[i].as_canonical_u64(), + "Fused vs generic mismatch at index {i} (random input, w8)" + ); + } + } + + /// Same fused-vs-generic verification for width 12. + #[test] + fn test_fused_matches_generic_w12() { + let generic = default_goldilocks_poseidon1_12(); + let fused = make_fused_w12(); + let mut rng = SmallRng::seed_from_u64(42); + + let mut g_state = [F::ZERO; 12]; + let mut f_state = [F::ZERO; 12]; + generic.permute_mut(&mut g_state); + fused.permute_mut(&mut f_state); + for i in 0..12 { + assert_eq!( + f_state[i].as_canonical_u64(), + g_state[i].as_canonical_u64(), + "Fused vs generic mismatch at index {i} (zero input, w12)" + ); + } + + let mut g_state: [F; 12] = rng.random(); + let mut f_state = g_state; + generic.permute_mut(&mut g_state); + fused.permute_mut(&mut f_state); + for i in 0..12 { + assert_eq!( + f_state[i].as_canonical_u64(), + g_state[i].as_canonical_u64(), + "Fused vs generic mismatch at index {i} (random input, w12)" + ); + } + } + + /// Verify that the packed (dual-lane) width-8 path matches running + /// two independent scalar permutations. + #[test] + fn test_packed_matches_scalar_w8() { + let fused = make_fused_w8(); + let mut rng = SmallRng::seed_from_u64(123); + + // Two independent random scalar inputs. + let scalar_a: [F; 8] = rng.random(); + let scalar_b: [F; 8] = rng.random(); + + // Pack them into a single packed state and permute. 
+ let mut packed: [PackedGoldilocksNeon; 8] = + core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]])); + fused.permute_mut(&mut packed); + + // Compute the expected result by running scalar on each independently. + let mut expected_a = scalar_a; + let mut expected_b = scalar_b; + fused.permute_mut(&mut expected_a); + fused.permute_mut(&mut expected_b); + + // Lane 0 must match the first scalar, lane 1 must match the second. + for i in 0..8 { + assert_eq!( + packed[i].0[0].as_canonical_u64(), + expected_a[i].as_canonical_u64(), + "Packed lane0 mismatch at index {i} (w8)" + ); + assert_eq!( + packed[i].0[1].as_canonical_u64(), + expected_b[i].as_canonical_u64(), + "Packed lane1 mismatch at index {i} (w8)" + ); + } + } + + /// Same packed-vs-scalar verification for width 12. + #[test] + fn test_packed_matches_scalar_w12() { + let fused = make_fused_w12(); + let mut rng = SmallRng::seed_from_u64(123); + + let scalar_a: [F; 12] = rng.random(); + let scalar_b: [F; 12] = rng.random(); + + let mut packed: [PackedGoldilocksNeon; 12] = + core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]])); + fused.permute_mut(&mut packed); + + let mut expected_a = scalar_a; + let mut expected_b = scalar_b; + fused.permute_mut(&mut expected_a); + fused.permute_mut(&mut expected_b); + + for i in 0..12 { + assert_eq!( + packed[i].0[0].as_canonical_u64(), + expected_a[i].as_canonical_u64(), + "Packed lane0 mismatch at index {i} (w12)" + ); + assert_eq!( + packed[i].0[1].as_canonical_u64(), + expected_b[i].as_canonical_u64(), + "Packed lane1 mismatch at index {i} (w12)" + ); + } + } + + /// Known-answer test for width 8 (sequential 0..7 input). 
+ #[test] + fn test_fused_kat_w8() { + let fused = make_fused_w8(); + let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]); + fused.permute_mut(&mut input); + + let expected: [F; 8] = F::new_array([ + 2431226948502761687, + 9427563026145807618, + 6827549936272051660, + 16907684411084503785, + 10131745626715172913, + 17448305483431576765, + 9066501914269485014, + 12095238468458521303, + ]); + assert_eq!(input, expected); + } + + /// Known-answer test for width 12 (sequential 0..11 input). + #[test] + fn test_fused_kat_w12() { + let fused = make_fused_w12(); + let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + fused.permute_mut(&mut input); + + let expected: [F; 12] = F::new_array([ + 15595088881848875364, + 9564850329150784619, + 13607005230761744521, + 12117102595842533385, + 2814257411756993122, + 11640647689983397089, + 14363867760831937423, + 13323891071259596526, + 11219803511311150468, + 9221595262780869902, + 5898229059046891887, + 18181291031484020550, + ]); + assert_eq!(input, expected); + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs new file mode 100644 index 000000000..3ca1382a9 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs @@ -0,0 +1,843 @@ +//! ARM assembly primitives for the Poseidon1 permutation over Goldilocks. + +use super::utils::{add_asm, mul_add_asm, mul_asm}; + +// --------------------------------------------------------------------------- +// S-box: x -> x^7 (applied to the first element only) +// --------------------------------------------------------------------------- + +/// Apply the degree-7 S-box to the first element of the state. 
///
/// Computes `x^7` using four multiplications via the addition chain:
///
/// ```text
/// x -> x^2 -> x^3 (= x^2 * x)
///             x^4 (= x^2 * x^2)
///             x^7 (= x^3 * x^4)
/// ```
///
/// Only the first element is modified. All other elements are unchanged.
/// This corresponds to the non-linear step of a **partial round**.
///
/// # Panics
///
/// Panics if `state` is empty (it indexes `state[0]`).
///
/// # Safety
///
/// NOTE(review): this function only indexes `state[0]` and calls `mul_asm`;
/// any further requirement is inherited from `mul_asm` in `utils`, which is
/// not visible here — confirm.
#[inline(always)]
pub unsafe fn sbox_s0_asm(state: &mut [u64]) {
    unsafe {
        // Load the first element.
        let s0 = state[0];

        // Square: x^2.
        let s0_2 = mul_asm(s0, s0);

        // Cube: x^3 = x^2 * x.
        let s0_3 = mul_asm(s0_2, s0);

        // Fourth power: x^4 = x^2 * x^2.
        let s0_4 = mul_asm(s0_2, s0_2);

        // Seventh power: x^7 = x^3 * x^4.
        state[0] = mul_asm(s0_3, s0_4);
    }
}

/// Dual-lane S-box on the first element of two independent states.
///
/// Applies the same degree-7 S-box to both first elements. Interleaving
/// the two chains hides the multiplication latency: while one multiply
/// retires, the other is already in flight.
///
/// # Panics
///
/// Panics if either slice is empty (both are indexed at `[0]`).
///
/// # Safety
///
/// NOTE(review): same as the single-lane version — the only unsafe
/// operations are the `mul_asm` calls; confirm their contract in `utils`.
#[inline(always)]
pub unsafe fn sbox_s0_dual_asm(state0: &mut [u64], state1: &mut [u64]) {
    unsafe {
        // Load both first elements.
        let a = state0[0];
        let b = state1[0];

        // Square both.
        let a2 = mul_asm(a, a);
        let b2 = mul_asm(b, b);

        // Cube both: x^3 = x^2 * x.
        let a3 = mul_asm(a2, a);
        let b3 = mul_asm(b2, b);

        // Fourth power both: x^4 = x^2 * x^2.
        let a4 = mul_asm(a2, a2);
        let b4 = mul_asm(b2, b2);

        // Seventh power both: x^7 = x^3 * x^4.
        state0[0] = mul_asm(a3, a4);
        state1[0] = mul_asm(b3, b4);
    }
}

// ---------------------------------------------------------------------------
// Sparse matrix-vector multiply (partial-round linear layer)
// ---------------------------------------------------------------------------

/// Sparse matrix-vector multiply for a width-8 state.
///
/// Implements the partial-round linear layer. The sparse matrix is
/// encoded as its first row and a sub-diagonal vector:
///
/// ```text
/// new[0] = dot(first_row, state)          (dot product)
/// new[i] = state[i] + state[0] * v[i-1]   (for i >= 1)
/// ```
///
/// The original first element is captured before the dot product
/// overwrites it. The unrolled form avoids loop overhead and gives
/// the scheduler maximum freedom to reorder independent multiply-adds.
///
/// Only `v[0]` through `v[6]` are read; `v[7]` is unused.
///
/// # Safety
///
/// NOTE(review): all indexing is on fixed-size arrays; the only unsafe
/// operations are the `mul_asm` / `mul_add_asm` calls — confirm their
/// contract in `utils`.
#[inline(always)]
pub unsafe fn cheap_matmul_asm_w8(state: &mut [u64; 8], first_row: &[u64; 8], v: &[u64; 8]) {
    unsafe {
        // Capture the original first element before it gets overwritten.
        let old_s0 = state[0];

        // Dot product: accumulate dot(first_row, state).
        // The accumulator is kept in a local so `state[0]` still holds the
        // old value while the tail updates below run.
        let mut acc = mul_asm(state[0], first_row[0]);
        acc = mul_add_asm(state[1], first_row[1], acc);
        acc = mul_add_asm(state[2], first_row[2], acc);
        acc = mul_add_asm(state[3], first_row[3], acc);
        acc = mul_add_asm(state[4], first_row[4], acc);
        acc = mul_add_asm(state[5], first_row[5], acc);
        acc = mul_add_asm(state[6], first_row[6], acc);
        acc = mul_add_asm(state[7], first_row[7], acc);

        // Tail update: each remaining element gets old_first * v[i-1] added.
        state[1] = mul_add_asm(old_s0, v[0], state[1]);
        state[2] = mul_add_asm(old_s0, v[1], state[2]);
        state[3] = mul_add_asm(old_s0, v[2], state[3]);
        state[4] = mul_add_asm(old_s0, v[3], state[4]);
        state[5] = mul_add_asm(old_s0, v[4], state[5]);
        state[6] = mul_add_asm(old_s0, v[5], state[6]);
        state[7] = mul_add_asm(old_s0, v[6], state[7]);

        // Write the dot-product result into the first slot.
        state[0] = acc;
    }
}

/// Sparse matrix-vector multiply for a width-12 state.
///
/// Same decomposition as the width-8 variant:
/// - Dot product for the new first element.
/// - Scalar multiply-add for every other element.
#[inline(always)]
pub unsafe fn cheap_matmul_asm_w12(state: &mut [u64; 12], first_row: &[u64; 12], v: &[u64; 12]) {
    // NOTE(review): the only unsafe operations below are the `mul_asm` /
    // `mul_add_asm` calls; their contract lives in `utils` — confirm.
    unsafe {
        // Capture the original first element before it gets overwritten.
        let old_s0 = state[0];

        // Dot product: accumulate dot(first_row, state).
        // The accumulator is kept in a local so `state[0]` still holds the
        // old value while the tail updates below run.
        let mut acc = mul_asm(state[0], first_row[0]);
        acc = mul_add_asm(state[1], first_row[1], acc);
        acc = mul_add_asm(state[2], first_row[2], acc);
        acc = mul_add_asm(state[3], first_row[3], acc);
        acc = mul_add_asm(state[4], first_row[4], acc);
        acc = mul_add_asm(state[5], first_row[5], acc);
        acc = mul_add_asm(state[6], first_row[6], acc);
        acc = mul_add_asm(state[7], first_row[7], acc);
        acc = mul_add_asm(state[8], first_row[8], acc);
        acc = mul_add_asm(state[9], first_row[9], acc);
        acc = mul_add_asm(state[10], first_row[10], acc);
        acc = mul_add_asm(state[11], first_row[11], acc);

        // Tail update: each remaining element gets old_first * v[i-1] added.
        // Only `v[0]` through `v[10]` are read; `v[11]` is unused.
        state[1] = mul_add_asm(old_s0, v[0], state[1]);
        state[2] = mul_add_asm(old_s0, v[1], state[2]);
        state[3] = mul_add_asm(old_s0, v[2], state[3]);
        state[4] = mul_add_asm(old_s0, v[3], state[4]);
        state[5] = mul_add_asm(old_s0, v[4], state[5]);
        state[6] = mul_add_asm(old_s0, v[5], state[6]);
        state[7] = mul_add_asm(old_s0, v[6], state[7]);
        state[8] = mul_add_asm(old_s0, v[7], state[8]);
        state[9] = mul_add_asm(old_s0, v[8], state[9]);
        state[10] = mul_add_asm(old_s0, v[9], state[10]);
        state[11] = mul_add_asm(old_s0, v[10], state[11]);

        // Write the dot-product result into the first slot.
        state[0] = acc;
    }
}

/// Dual-lane sparse matrix-vector multiply for a width-8 state.
///
/// Processes two independent states through the same sparse matrix
/// simultaneously. Both lanes share the same first-row and sub-diagonal
/// vectors, since the matrix is fixed for a given partial round.
///
/// Interleaving multiply-adds from both lanes keeps the pipeline full.
#[inline(always)]
pub unsafe fn cheap_matmul_dual_asm_w8(
    s0: &mut [u64; 8],
    s1: &mut [u64; 8],
    first_row: &[u64; 8],
    v: &[u64; 8],
) {
    // NOTE(review): the only unsafe operations below are the `mul_asm` /
    // `mul_add_asm` calls; their contract lives in `utils` — confirm.
    unsafe {
        // Capture the original first elements from both lanes.
        let old_a = s0[0];
        let old_b = s1[0];

        // Dot products: one per lane, interleaved.
        // Lane A uses `s0`, lane B uses `s1`; the two chains are independent.
        let mut acc_a = mul_asm(s0[0], first_row[0]);
        let mut acc_b = mul_asm(s1[0], first_row[0]);
        acc_a = mul_add_asm(s0[1], first_row[1], acc_a);
        acc_b = mul_add_asm(s1[1], first_row[1], acc_b);
        acc_a = mul_add_asm(s0[2], first_row[2], acc_a);
        acc_b = mul_add_asm(s1[2], first_row[2], acc_b);
        acc_a = mul_add_asm(s0[3], first_row[3], acc_a);
        acc_b = mul_add_asm(s1[3], first_row[3], acc_b);
        acc_a = mul_add_asm(s0[4], first_row[4], acc_a);
        acc_b = mul_add_asm(s1[4], first_row[4], acc_b);
        acc_a = mul_add_asm(s0[5], first_row[5], acc_a);
        acc_b = mul_add_asm(s1[5], first_row[5], acc_b);
        acc_a = mul_add_asm(s0[6], first_row[6], acc_a);
        acc_b = mul_add_asm(s1[6], first_row[6], acc_b);
        acc_a = mul_add_asm(s0[7], first_row[7], acc_a);
        acc_b = mul_add_asm(s1[7], first_row[7], acc_b);

        // Tail updates: both lanes, interleaved.
        // Only `v[0]` through `v[6]` are read; `v[7]` is unused.
        s0[1] = mul_add_asm(old_a, v[0], s0[1]);
        s1[1] = mul_add_asm(old_b, v[0], s1[1]);
        s0[2] = mul_add_asm(old_a, v[1], s0[2]);
        s1[2] = mul_add_asm(old_b, v[1], s1[2]);
        s0[3] = mul_add_asm(old_a, v[2], s0[3]);
        s1[3] = mul_add_asm(old_b, v[2], s1[3]);
        s0[4] = mul_add_asm(old_a, v[3], s0[4]);
        s1[4] = mul_add_asm(old_b, v[3], s1[4]);
        s0[5] = mul_add_asm(old_a, v[4], s0[5]);
        s1[5] = mul_add_asm(old_b, v[4], s1[5]);
        s0[6] = mul_add_asm(old_a, v[5], s0[6]);
        s1[6] = mul_add_asm(old_b, v[5], s1[6]);
        s0[7] = mul_add_asm(old_a, v[6], s0[7]);
        s1[7] = mul_add_asm(old_b, v[6], s1[7]);

        // Write the dot-product results into the first slots.
        s0[0] = acc_a;
        s1[0] = acc_b;
    }
}

/// Dual-lane sparse matrix-vector multiply for a width-12 state.
+/// +/// Same as the width-8 dual variant but with 12-element states. +/// Uses loops instead of full unrolling since width 12 is large +/// enough that code size matters more than marginal scheduling gains. +#[inline(always)] +pub unsafe fn cheap_matmul_dual_asm_w12( + s0: &mut [u64; 12], + s1: &mut [u64; 12], + first_row: &[u64; 12], + v: &[u64; 12], +) { + unsafe { + // Capture the original first elements from both lanes. + let old_a = s0[0]; + let old_b = s1[0]; + + // Dot products: one per lane, interleaved. + let mut acc_a = mul_asm(s0[0], first_row[0]); + let mut acc_b = mul_asm(s1[0], first_row[0]); + for i in 1..12 { + acc_a = mul_add_asm(s0[i], first_row[i], acc_a); + acc_b = mul_add_asm(s1[i], first_row[i], acc_b); + } + + // Tail updates: both lanes. + for i in 1..12 { + s0[i] = mul_add_asm(old_a, v[i - 1], s0[i]); + s1[i] = mul_add_asm(old_b, v[i - 1], s1[i]); + } + + // Write the dot-product results into the first slots. + s0[0] = acc_a; + s1[0] = acc_b; + } +} + +// --------------------------------------------------------------------------- +// Dense matrix-vector multiply (full-round linear layer) +// --------------------------------------------------------------------------- + +/// Dense matrix-vector multiply for a width-8 state. +/// +/// Computes `state = M * state` where M is a full 8x8 MDS matrix +/// stored in row-major order. Used in the **full rounds** of the +/// permutation where every element is mixed with every other. +/// +/// Each output element is the dot product of one matrix row with the +/// input vector. The input is snapshotted before any writes occur. +pub fn dense_matmul_asm_w8(state: &mut [u64; 8], m: &[[u64; 8]; 8]) { + unsafe { + // Snapshot the current state so reads are not clobbered by writes. + let input = *state; + + // Compute each output element as a dot product of one matrix + // row with the snapshotted input. 
+ for i in 0..8 { + let mut acc = mul_asm(input[0], m[i][0]); + for j in 1..8 { + acc = mul_add_asm(input[j], m[i][j], acc); + } + state[i] = acc; + } + } +} + +/// Dense matrix-vector multiply for a width-12 state. +/// +/// Same as the width-8 variant but with a 12×12 MDS matrix. +pub fn dense_matmul_asm_w12(state: &mut [u64; 12], m: &[[u64; 12]; 12]) { + unsafe { + // Snapshot the current state. + let input = *state; + + // One dot product per output element. + for i in 0..12 { + let mut acc = mul_asm(input[0], m[i][0]); + for j in 1..12 { + acc = mul_add_asm(input[j], m[i][j], acc); + } + state[i] = acc; + } + } +} + +/// Dual-lane dense matrix-vector multiply for a width-8 state. +/// +/// Multiplies two independent state vectors by the same 8×8 matrix. +/// Both lanes share the matrix but have their own input and output. +/// +/// Interleaving the two dot-product chains per row hides latency. +pub fn dense_matmul_dual_asm_w8(s0: &mut [u64; 8], s1: &mut [u64; 8], m: &[[u64; 8]; 8]) { + unsafe { + // Snapshot both input vectors. + let in0 = *s0; + let in1 = *s1; + + // For each row, compute both dot products in lockstep. + for i in 0..8 { + let mut a = mul_asm(in0[0], m[i][0]); + let mut b = mul_asm(in1[0], m[i][0]); + for j in 1..8 { + a = mul_add_asm(in0[j], m[i][j], a); + b = mul_add_asm(in1[j], m[i][j], b); + } + s0[i] = a; + s1[i] = b; + } + } +} + +/// Dual-lane dense matrix-vector multiply for a width-12 state. +/// +/// Same as the width-8 dual variant but with a 12×12 matrix. +pub fn dense_matmul_dual_asm_w12(s0: &mut [u64; 12], s1: &mut [u64; 12], m: &[[u64; 12]; 12]) { + unsafe { + // Snapshot both input vectors. + let in0 = *s0; + let in1 = *s1; + + // For each row, compute both dot products in lockstep. 
+ for i in 0..12 { + let mut a = mul_asm(in0[0], m[i][0]); + let mut b = mul_asm(in1[0], m[i][0]); + for j in 1..12 { + a = mul_add_asm(in0[j], m[i][j], a); + b = mul_add_asm(in1[j], m[i][j], b); + } + s0[i] = a; + s1[i] = b; + } + } +} + +// --------------------------------------------------------------------------- +// Round-constant addition +// --------------------------------------------------------------------------- + +/// Add round constants to every element of the state. +/// +/// This is the first step of every Poseidon1 round. Each element +/// receives its own constant, added in the Goldilocks field. +/// +/// Generic over the state width to work with both width-8 and width-12. +#[inline(always)] +pub unsafe fn add_rc_asm(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) { + unsafe { + // Element-wise modular addition. + for i in 0..WIDTH { + state[i] = add_asm(state[i], rc[i]); + } + } +} + +/// Dual-lane round-constant addition. +/// +/// Adds the same constants to two independent states. Both lanes +/// share the constants because they are at the same round position. +#[inline(always)] +pub unsafe fn add_rc_dual_asm( + s0: &mut [u64; WIDTH], + s1: &mut [u64; WIDTH], + rc: &[u64; WIDTH], +) { + unsafe { + // Both lanes receive the same constant at each position. + for i in 0..WIDTH { + s0[i] = add_asm(s0[i], rc[i]); + s1[i] = add_asm(s1[i], rc[i]); + } + } +} + +/// Add a single round constant to the first element only. +/// +/// Used in partial rounds where only the first element enters the +/// S-box and thus only needs its own constant added. +#[inline(always)] +pub unsafe fn add_scalar_s0_asm(state: &mut [u64], rc: u64) { + unsafe { + // Only the first element is modified. 
+ state[0] = add_asm(state[0], rc); + } +} + +#[cfg(test)] +mod tests { + use p3_field::PrimeField64; + use proptest::prelude::*; + use rand::SeedableRng; + use rand::rngs::SmallRng; + + use super::*; + use crate::Goldilocks; + + type F = Goldilocks; + + /// Reduce a raw `u64` to its canonical Goldilocks representative. + /// + /// Wraps the value into a field element and extracts the unique + /// representative in `[0, P)`. This is the single source of truth + /// for comparing ASM outputs (which may carry unreduced values) + /// against field-level references. + fn canon(x: u64) -> u64 { + F::new(x).as_canonical_u64() + } + + proptest! { + // ================================================================ + // S-box: first element raised to the 7th power + // ================================================================ + + /// Verify the single-lane S-box against a field-level reference. + /// + /// The reference computes x^7 step by step using field multiplication. + /// Only the first element should change; the rest must be untouched. + #[test] + fn test_sbox_s0_asm(vals in prop::array::uniform8(any::())) { + // Build the expected x^7 using the field multiplication chain. + let x = F::new(vals[0]); + let x2 = x * x; + let x3 = x2 * x; + let x4 = x2 * x2; + let expected_s0 = (x3 * x4).as_canonical_u64(); + + // Run the ASM version on a copy. + let mut state = vals; + unsafe { sbox_s0_asm(&mut state); } + + // The first element must match x^7. + prop_assert_eq!(canon(state[0]), expected_s0); + + // Every other element must be unchanged. + for i in 1..8 { + prop_assert_eq!(state[i], vals[i]); + } + } + + /// Verify the dual-lane S-box matches two independent single-lane calls. + /// + /// Runs the single-lane version on each lane separately as the + /// reference, then checks the dual-lane version produces the same. 
+ #[test] + fn test_sbox_s0_dual_asm( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + // Build the reference by running single-lane on each lane. + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + sbox_s0_asm(&mut ref0); + sbox_s0_asm(&mut ref1); + } + + // Run the dual-lane version. + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { sbox_s0_dual_asm(&mut s0, &mut s1); } + + // Both first elements must match their reference. + prop_assert_eq!(canon(s0[0]), canon(ref0[0])); + prop_assert_eq!(canon(s1[0]), canon(ref1[0])); + + // All other elements must be unchanged. + for i in 1..8 { + prop_assert_eq!(s0[i], vals0[i]); + prop_assert_eq!(s1[i], vals1[i]); + } + } + + // ================================================================ + // Round-constant addition: element-wise field addition + // ================================================================ + + /// Verify round-constant addition (width 8) against field addition. + /// + /// Each element should equal the field sum of the original value + /// and its corresponding round constant. + #[test] + fn test_add_rc_asm_w8( + vals in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + // Build the expected result using field addition. + let expected: [u64; 8] = core::array::from_fn(|i| { + (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64() + }); + + // Run the ASM version. + let mut state = vals; + unsafe { add_rc_asm(&mut state, &rc); } + + // Every element must match. + for i in 0..8 { + prop_assert_eq!(canon(state[i]), expected[i]); + } + } + + /// Same verification for width 12. 
+ #[test] + fn test_add_rc_asm_w12( + vals in prop::array::uniform12(any::()), + rc in prop::array::uniform12(any::()), + ) { + let expected: [u64; 12] = core::array::from_fn(|i| { + (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64() + }); + + let mut state = vals; + unsafe { add_rc_asm(&mut state, &rc); } + + for i in 0..12 { + prop_assert_eq!(canon(state[i]), expected[i]); + } + } + + /// Verify dual-lane round-constant addition (width 8) matches + /// two independent single-lane calls. + #[test] + fn test_add_rc_dual_asm_w8( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + // Reference: single-lane on each independently. + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + add_rc_asm(&mut ref0, &rc); + add_rc_asm(&mut ref1, &rc); + } + + // Run the dual-lane version. + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); } + + // Both lanes must match their references. + for i in 0..8 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + /// Same dual-lane verification for width 12. 
+ #[test] + fn test_add_rc_dual_asm_w12( + vals0 in prop::array::uniform12(any::()), + vals1 in prop::array::uniform12(any::()), + rc in prop::array::uniform12(any::()), + ) { + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + add_rc_asm(&mut ref0, &rc); + add_rc_asm(&mut ref1, &rc); + } + + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); } + + for i in 0..12 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + // ================================================================ + // Scalar addition: first element only + // ================================================================ + + /// Verify that adding a scalar to the first element matches + /// field addition, and that all other elements are untouched. + #[test] + fn test_add_scalar_s0_asm(vals in prop::array::uniform8(any::()), rc: u64) { + // Expected: field sum of the first element and the constant. + let expected_s0 = (F::new(vals[0]) + F::new(rc)).as_canonical_u64(); + + // Run the ASM version. + let mut state = vals; + unsafe { add_scalar_s0_asm(&mut state, rc); } + + // The first element must match. + prop_assert_eq!(canon(state[0]), expected_s0); + + // Every other element must be unchanged. + for i in 1..8 { + prop_assert_eq!(state[i], vals[i]); + } + } + + // ================================================================ + // Sparse matrix-vector multiply (partial-round linear layer) + // + // The sparse matrix decomposes into: + // new[0] = dot(first_row, state) + // new[i] = state[i] + state[0] * v[i-1] for i >= 1 + // ================================================================ + + /// Verify the width-8 sparse matmul against a field-level reference. + /// + /// Builds the expected result by computing the dot product and + /// the per-element multiply-add using Goldilocks field operations. 
+ #[test] + fn test_cheap_matmul_asm_w8( + vals in prop::array::uniform8(any::()), + first_row in prop::array::uniform8(any::()), + v in prop::array::uniform8(any::()), + ) { + // Lift raw values into field elements. + let f: [F; 8] = vals.map(F::new); + let fr: [F; 8] = first_row.map(F::new); + let fv: [F; 8] = v.map(F::new); + + // Capture the original first element. + let old_s0 = f[0]; + + // Dot product for the new first element. + let new_s0: F = (0..8).map(|i| f[i] * fr[i]).sum(); + + // Tail update for elements 1..8. + let mut expected = f; + for i in 1..8 { + expected[i] = f[i] + old_s0 * fv[i - 1]; + } + expected[0] = new_s0; + + // Run the ASM version. + let mut state = vals; + unsafe { cheap_matmul_asm_w8(&mut state, &first_row, &v); } + + // Every element must match. + for i in 0..8 { + prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); + } + } + + /// Same verification for width 12. + #[test] + fn test_cheap_matmul_asm_w12( + vals in prop::array::uniform12(any::()), + first_row in prop::array::uniform12(any::()), + v in prop::array::uniform12(any::()), + ) { + let f: [F; 12] = vals.map(F::new); + let fr: [F; 12] = first_row.map(F::new); + let fv: [F; 12] = v.map(F::new); + + let old_s0 = f[0]; + let new_s0: F = (0..12).map(|i| f[i] * fr[i]).sum(); + + let mut expected = f; + for i in 1..12 { + expected[i] = f[i] + old_s0 * fv[i - 1]; + } + expected[0] = new_s0; + + let mut state = vals; + unsafe { cheap_matmul_asm_w12(&mut state, &first_row, &v); } + + for i in 0..12 { + prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); + } + } + + /// Verify the width-8 dual-lane sparse matmul matches two + /// independent single-lane calls. + #[test] + fn test_cheap_matmul_dual_asm_w8( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + first_row in prop::array::uniform8(any::()), + v in prop::array::uniform8(any::()), + ) { + // Reference: single-lane on each independently. 
+ let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + cheap_matmul_asm_w8(&mut ref0, &first_row, &v); + cheap_matmul_asm_w8(&mut ref1, &first_row, &v); + } + + // Run the dual-lane version. + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { cheap_matmul_dual_asm_w8(&mut s0, &mut s1, &first_row, &v); } + + // Both lanes must match their references. + for i in 0..8 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + /// Same dual-lane verification for width 12. + #[test] + fn test_cheap_matmul_dual_asm_w12( + vals0 in prop::array::uniform12(any::()), + vals1 in prop::array::uniform12(any::()), + first_row in prop::array::uniform12(any::()), + v in prop::array::uniform12(any::()), + ) { + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + cheap_matmul_asm_w12(&mut ref0, &first_row, &v); + cheap_matmul_asm_w12(&mut ref1, &first_row, &v); + } + + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { cheap_matmul_dual_asm_w12(&mut s0, &mut s1, &first_row, &v); } + + for i in 0..12 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + // ================================================================ + // Dense matrix-vector multiply (full-round linear layer) + // ================================================================ + + /// Verify the width-8 dense matmul against a field-level reference. + /// + /// Each output element is the dot product of one matrix row with + /// the input vector. The matrix is fixed from a deterministic seed. + #[test] + fn test_dense_matmul_asm_w8(vals in prop::array::uniform8(any::())) { + // Fixed matrix from a deterministic seed. + let mut rng = SmallRng::seed_from_u64(42); + let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng); + + // Reference: standard matrix-vector product using field ops. 
+ let f: [F; 8] = vals.map(F::new); + let expected: [F; 8] = core::array::from_fn(|i| { + (0..8).map(|j| f[j] * F::new(m[i][j])).sum() + }); + + // Run the ASM version. + let mut state = vals; + dense_matmul_asm_w8(&mut state, &m); + + // Every element must match. + for i in 0..8 { + prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); + } + } + + /// Same verification for width 12. + #[test] + fn test_dense_matmul_asm_w12(vals in prop::array::uniform12(any::())) { + let mut rng = SmallRng::seed_from_u64(43); + let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng); + + let f: [F; 12] = vals.map(F::new); + let expected: [F; 12] = core::array::from_fn(|i| { + (0..12).map(|j| f[j] * F::new(m[i][j])).sum() + }); + + let mut state = vals; + dense_matmul_asm_w12(&mut state, &m); + + for i in 0..12 { + prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); + } + } + + /// Verify the width-8 dual-lane dense matmul matches two + /// independent single-lane calls. + #[test] + fn test_dense_matmul_dual_asm_w8( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + // Fixed matrix from a deterministic seed. + let mut rng = SmallRng::seed_from_u64(44); + let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng); + + // Reference: single-lane on each independently. + let mut ref0 = vals0; + let mut ref1 = vals1; + dense_matmul_asm_w8(&mut ref0, &m); + dense_matmul_asm_w8(&mut ref1, &m); + + // Run the dual-lane version. + let mut s0 = vals0; + let mut s1 = vals1; + dense_matmul_dual_asm_w8(&mut s0, &mut s1, &m); + + // Both lanes must match their references. + for i in 0..8 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + /// Same dual-lane verification for width 12. 
+ #[test] + fn test_dense_matmul_dual_asm_w12( + vals0 in prop::array::uniform12(any::()), + vals1 in prop::array::uniform12(any::()), + ) { + let mut rng = SmallRng::seed_from_u64(45); + let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng); + + let mut ref0 = vals0; + let mut ref1 = vals1; + dense_matmul_asm_w12(&mut ref0, &m); + dense_matmul_asm_w12(&mut ref1, &m); + + let mut s0 = vals0; + let mut s1 = vals1; + dense_matmul_dual_asm_w12(&mut s0, &mut s1, &m); + + for i in 0..12 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs new file mode 100644 index 000000000..cf74b4df8 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs @@ -0,0 +1,652 @@ +//! Optimized Poseidon2 for Goldilocks on aarch64. +//! +//! Uses ARM inline assembly with latency hiding via interleaved S-box/MDS computation. +//! Fully unrolled internal rounds for W8, W12, W16. +//! +//! For packed operations, lanes are extracted to scalar, processed with interleaved +//! dual-lane ASM, then repacked. This is faster than using PackedGoldilocksNeon +//! arithmetic directly because the scalar `add_asm` avoids the modular reduction +//! overhead present in NEON addition. + +use alloc::vec::Vec; + +use p3_poseidon2::{ + ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, InternalLayer, + InternalLayerConstructor, poseidon2_round_numbers_128, +}; +use p3_symmetric::{CryptographicPermutation, Permutation}; +use rand::distr::{Distribution, StandardUniform}; +use rand::{Rng, RngExt}; + +use super::packing::PackedGoldilocksNeon; +use super::poseidon2_asm::*; +use super::utils::{pack_lanes, unpack_lanes}; +use crate::{Goldilocks, MATRIX_DIAG_20_GOLDILOCKS}; + +/// Degree of the chosen permutation polynomial for Goldilocks. 
+const GOLDILOCKS_S_BOX_DEGREE: u64 = 7; + +/// ASM-optimized internal layer with split-state s0-in-register, pre-converted constants. +#[derive(Debug, Default, Clone)] +pub struct Poseidon2InternalLayerGoldilocksAsm { + constants_raw: Vec, +} + +impl InternalLayerConstructor for Poseidon2InternalLayerGoldilocksAsm { + fn new_from_constants(internal_constants: Vec) -> Self { + let constants_raw = internal_constants.iter().map(|c| c.value).collect(); + Self { constants_raw } + } +} + +const DIAG_RAW_20: [u64; 20] = { + let mut arr = [0u64; 20]; + let mut i = 0; + while i < 20 { + arr[i] = MATRIX_DIAG_20_GOLDILOCKS[i].value; + i += 1; + } + arr +}; + +impl InternalLayer for Poseidon2InternalLayerGoldilocksAsm { + fn permute_state(&self, state: &mut [Goldilocks; 8]) { + let state_raw: &mut [u64; 8] = + unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; + internal_permute_state_asm_w8(state_raw, &self.constants_raw); + } +} + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [Goldilocks; 12]) { + let state_raw: &mut [u64; 12] = + unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; + internal_permute_state_asm_w12(state_raw, &self.constants_raw); + } +} + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [Goldilocks; 16]) { + let state_raw: &mut [u64; 16] = + unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; + internal_permute_state_asm_w16(state_raw, &self.constants_raw); + } +} + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [Goldilocks; 20]) { + let state_raw: &mut [u64; 20] = + unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; + internal_permute_state_asm(state_raw, &DIAG_RAW_20, &self.constants_raw); + } +} + +#[derive(Clone)] +pub struct Poseidon2ExternalLayerGoldilocksAsm { + initial_constants_raw: Vec<[u64; WIDTH]>, + 
terminal_constants_raw: Vec<[u64; WIDTH]>, +} + +impl ExternalLayerConstructor + for Poseidon2ExternalLayerGoldilocksAsm +{ + fn new_from_constants(external_constants: ExternalLayerConstants) -> Self { + let initial_constants_raw = external_constants + .get_initial_constants() + .iter() + .map(|rc| core::array::from_fn(|i| rc[i].value)) + .collect(); + let terminal_constants_raw = external_constants + .get_terminal_constants() + .iter() + .map(|rc| core::array::from_fn(|i| rc[i].value)) + .collect(); + Self { + initial_constants_raw, + terminal_constants_raw, + } + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<8> +{ + fn permute_state_initial(&self, state: &mut [Goldilocks; 8]) { + let state_raw: &mut [u64; 8] = + unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; + external_initial_permute_w8(state_raw, &self.initial_constants_raw); + } + + fn permute_state_terminal(&self, state: &mut [Goldilocks; 8]) { + let state_raw: &mut [u64; 8] = + unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; + external_terminal_permute_w8(state_raw, &self.terminal_constants_raw); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<12> +{ + fn permute_state_initial(&self, state: &mut [Goldilocks; 12]) { + let state_raw: &mut [u64; 12] = + unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; + external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); + } + + fn permute_state_terminal(&self, state: &mut [Goldilocks; 12]) { + let state_raw: &mut [u64; 12] = + unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; + external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<16> +{ + fn permute_state_initial(&self, state: &mut [Goldilocks; 16]) { + let state_raw: &mut [u64; 16] = + unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; + 
external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); + } + + fn permute_state_terminal(&self, state: &mut [Goldilocks; 16]) { + let state_raw: &mut [u64; 16] = + unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; + external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<20> +{ + fn permute_state_initial(&self, state: &mut [Goldilocks; 20]) { + let state_raw: &mut [u64; 20] = + unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; + external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); + } + + fn permute_state_terminal(&self, state: &mut [Goldilocks; 20]) { + let state_raw: &mut [u64; 20] = + unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; + external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); + } +} + +/// Type alias for scalar ASM-optimized Poseidon2. +pub type Poseidon2GoldilocksAsm = p3_poseidon2::Poseidon2< + Goldilocks, + Poseidon2ExternalLayerGoldilocksAsm, + Poseidon2InternalLayerGoldilocksAsm, + WIDTH, + GOLDILOCKS_S_BOX_DEGREE, +>; + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 8]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + internal_permute_split_dual_w8(&mut lane0, &mut lane1, &self.constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 12]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + internal_permute_split_dual_w12(&mut lane0, &mut lane1, &self.constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 16]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + 
internal_permute_split_dual_w16(&mut lane0, &mut lane1, &self.constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl InternalLayer + for Poseidon2InternalLayerGoldilocksAsm +{ + fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 20]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + internal_permute_split_dual(&mut lane0, &mut lane1, &DIAG_RAW_20, &self.constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<8> +{ + fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 8]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_initial_permute_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw); + pack_lanes(state, &lane0, &lane1); + } + + fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 8]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_terminal_permute_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<12> +{ + fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 12]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw); + pack_lanes(state, &lane0, &lane1); + } + + fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 12]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<16> +{ + fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 16]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw); + pack_lanes(state, &lane0, &lane1); + } + + fn permute_state_terminal(&self, state: &mut 
[PackedGoldilocksNeon; 16]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl ExternalLayer + for Poseidon2ExternalLayerGoldilocksAsm<20> +{ + fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 20]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw); + pack_lanes(state, &lane0, &lane1); + } + + fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 20]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +/// Fused Poseidon2 permutation for Goldilocks. +/// +/// Instead of unpacking/packing between each of the 3 phases (initial external, +/// internal, terminal external), this performs a single unpack at the start and +/// a single pack at the end, eliminating the redundant lane conversions per +/// packed permutation. 
+#[derive(Clone, Debug)] +pub struct Poseidon2GoldilocksFused { + internal_constants_raw: Vec, + initial_constants_raw: Vec<[u64; WIDTH]>, + terminal_constants_raw: Vec<[u64; WIDTH]>, +} + +impl Poseidon2GoldilocksFused { + pub fn new( + external_constants: &ExternalLayerConstants, + internal_constants: &[Goldilocks], + ) -> Self { + let internal_constants_raw = internal_constants.iter().map(|c| c.value).collect(); + let initial_constants_raw = external_constants + .get_initial_constants() + .iter() + .map(|rc| core::array::from_fn(|i| rc[i].value)) + .collect(); + let terminal_constants_raw = external_constants + .get_terminal_constants() + .iter() + .map(|rc| core::array::from_fn(|i| rc[i].value)) + .collect(); + Self { + internal_constants_raw, + initial_constants_raw, + terminal_constants_raw, + } + } + + pub fn new_from_rng(rounds_f: usize, rounds_p: usize, rng: &mut R) -> Self + where + StandardUniform: Distribution + Distribution<[Goldilocks; WIDTH]>, + { + let external_constants = ExternalLayerConstants::new_from_rng(rounds_f, rng); + let internal_constants = rng + .sample_iter(StandardUniform) + .take(rounds_p) + .collect::>(); + Self::new(&external_constants, &internal_constants) + } + + pub fn new_from_rng_128(rng: &mut R) -> Self + where + StandardUniform: Distribution + Distribution<[Goldilocks; WIDTH]>, + { + let round_numbers = + poseidon2_round_numbers_128::(WIDTH, GOLDILOCKS_S_BOX_DEGREE); + let (rounds_f, rounds_p) = round_numbers.unwrap_or_else(|e| panic!("{e}")); + Self::new_from_rng(rounds_f, rounds_p, rng) + } +} + +impl Permutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> { + fn permute_mut(&self, state: &mut [Goldilocks; 8]) { + let state_raw: &mut [u64; 8] = + unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; + external_initial_permute_w8(state_raw, &self.initial_constants_raw); + internal_permute_state_asm_w8(state_raw, &self.internal_constants_raw); + external_terminal_permute_w8(state_raw, 
&self.terminal_constants_raw); + } +} + +impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> {} + +impl Permutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> { + fn permute_mut(&self, state: &mut [Goldilocks; 12]) { + let state_raw: &mut [u64; 12] = + unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; + external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); + internal_permute_state_asm_w12(state_raw, &self.internal_constants_raw); + external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); + } +} + +impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> {} + +impl Permutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> { + fn permute_mut(&self, state: &mut [Goldilocks; 16]) { + let state_raw: &mut [u64; 16] = + unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; + external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); + internal_permute_state_asm_w16(state_raw, &self.internal_constants_raw); + external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); + } +} + +impl CryptographicPermutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> {} + +impl Permutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> { + fn permute_mut(&self, state: &mut [Goldilocks; 20]) { + let state_raw: &mut [u64; 20] = + unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; + external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); + internal_permute_state_asm(state_raw, &DIAG_RAW_20, &self.internal_constants_raw); + external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); + } +} + +impl CryptographicPermutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> {} + +impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) { + let (mut lane0, mut lane1) = 
unpack_lanes(state); + external_initial_permute_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw); + internal_permute_split_dual_w8(&mut lane0, &mut lane1, &self.internal_constants_raw); + external_terminal_permute_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw); + pack_lanes(state, &lane0, &lane1); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> {} + +impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + let mut sv = lanes_to_neon(&lane0, &lane1); + external_initial_neon(&mut sv, &self.initial_constants_raw); + internal_permute_neon_w12(&mut sv, &self.internal_constants_raw); + external_terminal_neon(&mut sv, &self.terminal_constants_raw); + neon_to_lanes(&sv, &mut lane0, &mut lane1); + pack_lanes(state, &lane0, &lane1); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> {} + +impl Permutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 16]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + let mut sv = lanes_to_neon(&lane0, &lane1); + external_initial_neon(&mut sv, &self.initial_constants_raw); + internal_permute_neon_w16(&mut sv, &self.internal_constants_raw); + external_terminal_neon(&mut sv, &self.terminal_constants_raw); + neon_to_lanes(&sv, &mut lane0, &mut lane1); + pack_lanes(state, &lane0, &lane1); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> {} + +impl Permutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> { + fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 20]) { + let (mut lane0, mut lane1) = unpack_lanes(state); + let mut sv = lanes_to_neon(&lane0, &lane1); + external_initial_neon(&mut sv, &self.initial_constants_raw); + 
internal_permute_neon(&mut sv, &DIAG_RAW_20, &self.internal_constants_raw); + external_terminal_neon(&mut sv, &self.terminal_constants_raw); + neon_to_lanes(&sv, &mut lane0, &mut lane1); + pack_lanes(state, &lane0, &lane1); + } +} + +impl CryptographicPermutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> {} + +#[cfg(test)] +mod tests { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + use p3_poseidon2::{ExternalLayerConstants, InternalLayer, Poseidon2}; + use p3_symmetric::Permutation; + use rand::rngs::SmallRng; + use rand::{RngExt, SeedableRng}; + + use super::*; + use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE; + use crate::{ + GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8, + Poseidon2ExternalLayerGoldilocks, Poseidon2InternalLayerGoldilocks, + }; + + type F = Goldilocks; + + // Test that fully ASM-optimized implementation matches generic scalar + fn test_asm_matches_generic() + where + Poseidon2InternalLayerGoldilocks: InternalLayer, + Poseidon2InternalLayerGoldilocksAsm: InternalLayer, + Poseidon2ExternalLayerGoldilocksAsm: + ExternalLayer, + { + let mut rng = SmallRng::seed_from_u64(42); + + let external_constants = ExternalLayerConstants::::new_from_rng( + 2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, + &mut rng, + ); + let internal_constants: Vec = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8) + .map(|_| F::from_u64(rng.random())) + .collect(); + + // Generic scalar implementation + let generic_poseidon2: Poseidon2< + Goldilocks, + Poseidon2ExternalLayerGoldilocks, + Poseidon2InternalLayerGoldilocks, + WIDTH, + GOLDILOCKS_S_BOX_DEGREE, + > = Poseidon2::new(external_constants.clone(), internal_constants.clone()); + + // Fully ASM-optimized implementation + let asm_poseidon2: Poseidon2GoldilocksAsm = + Poseidon2::new(external_constants, internal_constants); + + // Test with zeros + let mut generic_input = [F::ZERO; WIDTH]; + let mut asm_input = [F::ZERO; WIDTH]; + + generic_poseidon2.permute_mut(&mut 
generic_input); + asm_poseidon2.permute_mut(&mut asm_input); + + for i in 0..WIDTH { + assert_eq!( + asm_input[i].as_canonical_u64(), + generic_input[i].as_canonical_u64(), + "ASM mismatch at index {i} for zero input" + ); + } + + // Test with random input + let mut generic_input: [F; WIDTH] = core::array::from_fn(|_| F::from_u64(rng.random())); + let mut asm_input = generic_input; + + generic_poseidon2.permute_mut(&mut generic_input); + asm_poseidon2.permute_mut(&mut asm_input); + + for i in 0..WIDTH { + assert_eq!( + asm_input[i].as_canonical_u64(), + generic_input[i].as_canonical_u64(), + "ASM mismatch at index {i} for random input" + ); + } + } + + fn test_fused_matches_generic() + where + Poseidon2InternalLayerGoldilocks: InternalLayer, + Poseidon2GoldilocksFused: + Permutation<[F; WIDTH]> + Permutation<[PackedGoldilocksNeon; WIDTH]>, + { + let mut rng = SmallRng::seed_from_u64(42); + + let external_constants = ExternalLayerConstants::::new_from_rng( + 2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, + &mut rng, + ); + let internal_constants: Vec = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8) + .map(|_| rng.random()) + .collect(); + + let generic_poseidon2: Poseidon2< + Goldilocks, + Poseidon2ExternalLayerGoldilocks, + Poseidon2InternalLayerGoldilocks, + WIDTH, + GOLDILOCKS_S_BOX_DEGREE, + > = Poseidon2::new(external_constants.clone(), internal_constants.clone()); + + let fused = + Poseidon2GoldilocksFused::::new(&external_constants, &internal_constants); + + // Scalar: fused vs generic + let mut generic_input = [F::ZERO; WIDTH]; + let mut fused_input = [F::ZERO; WIDTH]; + generic_poseidon2.permute_mut(&mut generic_input); + fused.permute_mut(&mut fused_input); + for i in 0..WIDTH { + assert_eq!( + fused_input[i].as_canonical_u64(), + generic_input[i].as_canonical_u64(), + "Fused scalar mismatch at index {i} for zero input" + ); + } + + let mut generic_input: [F; WIDTH] = rng.random(); + let mut fused_input = generic_input; + generic_poseidon2.permute_mut(&mut 
generic_input); + fused.permute_mut(&mut fused_input); + for i in 0..WIDTH { + assert_eq!( + fused_input[i].as_canonical_u64(), + generic_input[i].as_canonical_u64(), + "Fused scalar mismatch at index {i} for random input" + ); + } + + // Packed: fused packed vs scalar (each packed lane should match scalar) + let scalar_a: [F; WIDTH] = rng.random(); + let scalar_b: [F; WIDTH] = rng.random(); + + let mut packed_input: [PackedGoldilocksNeon; WIDTH] = + core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]])); + fused.permute_mut(&mut packed_input); + + let mut expected_a = scalar_a; + let mut expected_b = scalar_b; + fused.permute_mut(&mut expected_a); + fused.permute_mut(&mut expected_b); + + for i in 0..WIDTH { + assert_eq!( + packed_input[i].0[0].as_canonical_u64(), + expected_a[i].as_canonical_u64(), + "Fused packed lane0 mismatch at index {i}" + ); + assert_eq!( + packed_input[i].0[1].as_canonical_u64(), + expected_b[i].as_canonical_u64(), + "Fused packed lane1 mismatch at index {i}" + ); + } + } + + #[test] + fn test_asm_matches_generic_width_8() { + test_asm_matches_generic::<8>(); + } + + #[test] + fn test_asm_matches_generic_width_12() { + test_asm_matches_generic::<12>(); + } + + #[test] + fn test_asm_matches_generic_width_16() { + test_asm_matches_generic::<16>(); + } + + #[test] + fn test_asm_matches_generic_width_20() { + test_asm_matches_generic::<20>(); + } + + #[test] + fn test_fused_matches_generic_width_8() { + test_fused_matches_generic::<8>(); + } + + #[test] + fn test_fused_matches_generic_width_12() { + test_fused_matches_generic::<12>(); + } + + #[test] + fn test_fused_matches_generic_width_16() { + test_fused_matches_generic::<16>(); + } + + #[test] + fn test_fused_matches_generic_width_20() { + test_fused_matches_generic::<20>(); + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs new file mode 100644 index 
000000000..00b7fdc57 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs @@ -0,0 +1,2621 @@ +//! ARM assembly primitives for Poseidon2 on Goldilocks. +//! +//! Latency hiding: ARM mul/umulh have ~4-5 cycle latency. By interleaving +//! S-box computation with MDS operations, we hide much of this latency. + +use core::arch::aarch64::*; +use core::arch::asm; + +use super::utils::{add_asm, mul_add_asm, mul_asm}; +use crate::P; + +/// Compute x / 2 in the Goldilocks field, matching `halve_u64::

/// Halves `x` modulo the Goldilocks prime: the result is `x >> 1`, plus
/// `(P + 1) >> 1` when `x` is odd.
#[inline(always)]
unsafe fn div2_asm(x: u64) -> u64 {
    // (P + 1) / 2, i.e. the field element 1/2; added when the low bit of `x` is set.
    let shift = (P + 1) >> 1;
    let result: u64;
    // Scratch register for the asm block; its final value is not used.
    let _tmp: u64;

    unsafe {
        asm!(
            // result = x >> 1
            "lsr {result}, {x}, #1",
            // tmp = x & 1
            "and {tmp}, {x}, #1",
            // if tmp != 0 (x odd), tmp := shift, else tmp := 0
            "cmp {tmp}, #0",
            "csel {tmp}, {shift}, xzr, ne",
            // result += tmp
            "add {result}, {result}, {tmp}",
            x = in(reg) x,
            shift = in(reg) shift,
            tmp = out(reg) _tmp,
            result = out(reg) result,
            // `cmp` writes the condition flags, so `preserves_flags` is intentionally absent.
            options(pure, nomem, nostack),
        );
    }

    result
}

/// Divides by 4: two successive halvings.
#[inline(always)]
unsafe fn div4_asm(x: u64) -> u64 {
    unsafe { div2_asm(div2_asm(x)) }
}

/// Divides by 8: a halving applied to `div4_asm`.
#[inline(always)]
unsafe fn div8_asm(x: u64) -> u64 {
    unsafe { div2_asm(div4_asm(x)) }
}

/// Divides by 16: a halving applied to `div8_asm`.
#[inline(always)]
unsafe fn div16_asm(x: u64) -> u64 {
    unsafe { div2_asm(div8_asm(x)) }
}

/// Divides by 32: `div4_asm` applied to `div8_asm`.
#[inline(always)]
unsafe fn div32_asm(x: u64) -> u64 {
    unsafe { div4_asm(div8_asm(x)) }
}

/// Compute x * 2^{-32} mod P using the Goldilocks structure.
///
/// Since P = 2^64 - 2^32 + 1, we have 2^{-32} ≡ 1 - 2^{32} (mod P).
/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P,
/// where x_hi = x >> 32, x_lo = x & 0xFFFFFFFF.
#[inline(always)]
unsafe fn div_2_32_asm(x: u64) -> u64 {
    let result: u64;
    // The remaining bindings are scratch registers for the asm block only.
    let _hi: u64;
    let _lo: u64;
    let _t: u64;
    let _sum: u64;
    let _adj: u64;

    unsafe {
        asm!(
            // hi = x >> 32, lo = x & 0xFFFFFFFF
            "lsr {hi}, {x}, #32",
            "and {lo}, {x}, #0xFFFFFFFF",
            // sum = hi + lo (both halves are below 2^32, so this cannot overflow)
            "add {sum}, {hi}, {lo}",
            // t = lo << 32
            "lsl {t}, {lo}, #32",
            // result = sum - t, recording the borrow in the carry flag
            "subs {result}, {sum}, {t}",
            // adj = 0xFFFFFFFF (2^32 - 1) on borrow, else 0; writing the w-register
            // zeroes the upper half
            "csetm {adj:w}, cc",
            // on borrow, subtracting 2^32 - 1 equals adding P modulo 2^64
            "sub {result}, {result}, {adj}",
            x = in(reg) x,
            hi = out(reg) _hi,
            lo = out(reg) _lo,
            t = out(reg) _t,
            sum = out(reg) _sum,
            result = out(reg) result,
            // `adj` is written only after the last read of `x`, so it may share a register.
            adj = lateout(reg) _adj,
            options(pure, nomem, nostack),
        );
    }

    result
}

/// Subtract two Goldilocks elements with borrow handling using inline assembly.
+#[inline(always)] +unsafe fn sub_asm(a: u64, b: u64) -> u64 { + let result: u64; + let _adj: u64; + + unsafe { + asm!( + "subs {result}, {a}, {b}", + "csetm {adj:w}, cc", + "sub {result}, {result}, {adj}", + a = in(reg) a, + b = in(reg) b, + result = out(reg) result, + adj = out(reg) _adj, + options(pure, nomem, nostack), + ); + } + + result +} + +/// Split-state generic internal permute: s0 stays in a register across all rounds. +#[inline] +#[allow(clippy::needless_range_loop)] +pub fn internal_permute_state_asm( + state: &mut [u64; WIDTH], + diag: &[u64; WIDTH], + constants: &[u64], +) { + let mut s0 = state[0]; + for &rc in constants { + unsafe { + s0 = add_asm(s0, rc); + let s0_2 = mul_asm(s0, s0); + let s0_3 = mul_asm(s0_2, s0); + let s0_4 = mul_asm(s0_2, s0_2); + s0 = mul_asm(s0_3, s0_4); + + let mut sum_hi: u64 = 0; + for i in 1..WIDTH { + sum_hi = add_asm(sum_hi, state[i]); + } + + let mut diag_muls: [u64; WIDTH] = [0; WIDTH]; + for i in 1..WIDTH { + diag_muls[i] = mul_asm(state[i], diag[i]); + } + + let sum = add_asm(sum_hi, s0); + s0 = mul_add_asm(s0, diag[0], sum); + + for i in 1..WIDTH { + state[i] = add_asm(diag_muls[i], sum); + } + } + } + state[0] = s0; +} + +/// Split-state generic dual-lane internal permute for packed processing. 
+#[inline] +#[allow(clippy::needless_range_loop)] +pub fn internal_permute_split_dual( + lane0: &mut [u64; WIDTH], + lane1: &mut [u64; WIDTH], + diag: &[u64; WIDTH], + constants: &[u64], +) { + let mut s0_a = lane0[0]; + let mut s0_b = lane1[0]; + for &rc in constants { + unsafe { + s0_a = add_asm(s0_a, rc); + s0_b = add_asm(s0_b, rc); + let s0_2_a = mul_asm(s0_a, s0_a); + let s0_2_b = mul_asm(s0_b, s0_b); + let s0_3_a = mul_asm(s0_2_a, s0_a); + let s0_3_b = mul_asm(s0_2_b, s0_b); + let s0_4_a = mul_asm(s0_2_a, s0_2_a); + let s0_4_b = mul_asm(s0_2_b, s0_2_b); + s0_a = mul_asm(s0_3_a, s0_4_a); + s0_b = mul_asm(s0_3_b, s0_4_b); + + let mut sum_hi_a: u64 = 0; + let mut sum_hi_b: u64 = 0; + for i in 1..WIDTH { + sum_hi_a = add_asm(sum_hi_a, lane0[i]); + sum_hi_b = add_asm(sum_hi_b, lane1[i]); + } + + let mut diag_muls_a: [u64; WIDTH] = [0; WIDTH]; + let mut diag_muls_b: [u64; WIDTH] = [0; WIDTH]; + for i in 1..WIDTH { + diag_muls_a[i] = mul_asm(lane0[i], diag[i]); + diag_muls_b[i] = mul_asm(lane1[i], diag[i]); + } + + let sum_a = add_asm(sum_hi_a, s0_a); + let sum_b = add_asm(sum_hi_b, s0_b); + s0_a = mul_add_asm(s0_a, diag[0], sum_a); + s0_b = mul_add_asm(s0_b, diag[0], sum_b); + + for i in 1..WIDTH { + lane0[i] = add_asm(diag_muls_a[i], sum_a); + lane1[i] = add_asm(diag_muls_b[i], sum_b); + } + } + } + lane0[0] = s0_a; + lane1[0] = s0_b; +} + +/// Split-state W8 internal permute: s0 stays in a register across all rounds. 
///
/// Per round constant: `s0 += rc`, `s0 = s0^7`, then every element becomes
/// `coef_i * state[i] + sum`, with `sum` the sum of all elements after the S-box.
/// The multipliers are hard-coded as doublings, halvings and add/sub choices; as written
/// they are `[-2, 1, 2, 1/2, 3, -1/2, -3, -4]`.
/// The additions and the S-box multiplications are interleaved on purpose.
#[inline]
pub fn internal_permute_state_asm_w8(state: &mut [u64; 8], constants: &[u64]) {
    let mut s0 = state[0];
    for &rc in constants {
        unsafe {
            s0 = add_asm(s0, rc);
            let s0_2 = mul_asm(s0, s0);

            // Pairwise partial sums of state[1..], independent of the S-box chain.
            let sum1 = add_asm(state[1], state[2]);
            let sum2 = add_asm(state[3], state[4]);
            let sum3 = add_asm(state[5], state[6]);

            let s0_3 = mul_asm(s0_2, s0);
            let s0_4 = mul_asm(s0_2, s0_2);

            let sum12 = add_asm(sum1, sum2);
            let sum37 = add_asm(sum3, state[7]);

            // |coef_i| * state[i]; the sign is applied at the store (add vs. sub).
            let d1 = state[1];
            let d2 = double_asm(state[2]);
            let d3 = div2_asm(state[3]);
            let d4 = add_asm(double_asm(state[4]), state[4]);

            let sum_hi = add_asm(sum12, sum37);

            let d5 = div2_asm(state[5]);
            let d6 = add_asm(double_asm(state[6]), state[6]);
            let d7 = double_asm(double_asm(state[7]));

            // s0^7 = s0^3 * s0^4
            s0 = mul_asm(s0_3, s0_4);
            let sum = add_asm(sum_hi, s0);
            // V[0]=-2: new_s0 = sum + (-2)*s0 = sum_hi + s0 - 2*s0 = sum_hi - s0
            s0 = sub_asm(sum_hi, s0);

            state[1] = add_asm(d1, sum);
            state[2] = add_asm(d2, sum);
            state[3] = add_asm(d3, sum);
            state[4] = add_asm(d4, sum);
            state[5] = sub_asm(sum, d5);
            state[6] = sub_asm(sum, d6);
            state[7] = sub_asm(sum, d7);
        }
    }
    state[0] = s0;
}

/// Split-state dual-lane W8 internal permute for packed processing.
///
/// Same round function as the single-lane W8 variant, applied to two independent states
/// with the operations of lane `a` (`lane0`) and lane `b` (`lane1`) interleaved.
#[inline]
pub fn internal_permute_split_dual_w8(
    lane0: &mut [u64; 8],
    lane1: &mut [u64; 8],
    constants: &[u64],
) {
    let mut s0_a = lane0[0];
    let mut s0_b = lane1[0];
    for &rc in constants {
        unsafe {
            s0_a = add_asm(s0_a, rc);
            s0_b = add_asm(s0_b, rc);

            let s0_2_a = mul_asm(s0_a, s0_a);
            let s0_2_b = mul_asm(s0_b, s0_b);

            // Pairwise partial sums of elements 1.. of each lane.
            let sum1_a = add_asm(lane0[1], lane0[2]);
            let sum1_b = add_asm(lane1[1], lane1[2]);
            let sum2_a = add_asm(lane0[3], lane0[4]);
            let sum2_b = add_asm(lane1[3], lane1[4]);
            let sum3_a = add_asm(lane0[5], lane0[6]);
            let sum3_b = add_asm(lane1[5], lane1[6]);

            let s0_3_a = mul_asm(s0_2_a, s0_a);
            let s0_3_b = mul_asm(s0_2_b, s0_b);
            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
            let s0_4_b = mul_asm(s0_2_b, s0_2_b);

            let sum12_a = add_asm(sum1_a, sum2_a);
            let sum12_b = add_asm(sum1_b, sum2_b);
            let sum37_a = add_asm(sum3_a, lane0[7]);
            let sum37_b = add_asm(sum3_b, lane1[7]);

            // |coef_i| * lane[i]; the sign is applied at the store (add vs. sub).
            let d1_a = lane0[1];
            let d1_b = lane1[1];
            let d2_a = double_asm(lane0[2]);
            let d2_b = double_asm(lane1[2]);
            let d3_a = div2_asm(lane0[3]);
            let d3_b = div2_asm(lane1[3]);
            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]);
            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);

            let sum_hi_a = add_asm(sum12_a, sum37_a);
            let sum_hi_b = add_asm(sum12_b, sum37_b);

            let d5_a = div2_asm(lane0[5]);
            let d5_b = div2_asm(lane1[5]);
            let d6_a = add_asm(double_asm(lane0[6]), lane0[6]);
            let d6_b = add_asm(double_asm(lane1[6]), lane1[6]);
            let d7_a = double_asm(double_asm(lane0[7]));
            let d7_b = double_asm(double_asm(lane1[7]));

            // s0^7 = s0^3 * s0^4
            s0_a = mul_asm(s0_3_a, s0_4_a);
            s0_b = mul_asm(s0_3_b, s0_4_b);

            let sum_a = add_asm(sum_hi_a, s0_a);
            let sum_b = add_asm(sum_hi_b, s0_b);
            // Multiplier -2 on element 0: sum - 2*s0 = sum_hi - s0.
            s0_a = sub_asm(sum_hi_a, s0_a);
            s0_b = sub_asm(sum_hi_b, s0_b);

            lane0[1] = add_asm(d1_a, sum_a);
            lane1[1] = add_asm(d1_b, sum_b);
            lane0[2] = add_asm(d2_a, sum_a);
            lane1[2] = add_asm(d2_b, sum_b);
            lane0[3] = add_asm(d3_a, sum_a);
            lane1[3] = add_asm(d3_b, sum_b);
            lane0[4] = add_asm(d4_a, sum_a);
            lane1[4] = add_asm(d4_b, sum_b);
            lane0[5] = sub_asm(sum_a, d5_a);
            lane1[5] = sub_asm(sum_b, d5_b);
            lane0[6] = sub_asm(sum_a, d6_a);
            lane1[6] = sub_asm(sum_b, d6_b);
            lane0[7] = sub_asm(sum_a, d7_a);
            lane1[7] = sub_asm(sum_b, d7_b);
        }
    }
    lane0[0] = s0_a;
    lane1[0] = s0_b;
}

/// Split-state W12 internal permute: s0 stays in a register across all rounds.
///
/// Per round constant: `s0 += rc`, `s0 = s0^7`, then every element becomes
/// `coef_i * state[i] + sum`. As written the multipliers are
/// `[-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/4, -1/4, 1/8]`.
#[inline]
pub fn internal_permute_state_asm_w12(state: &mut [u64; 12], constants: &[u64]) {
    let mut s0 = state[0];
    for &rc in constants {
        unsafe {
            s0 = add_asm(s0, rc);
            let s0_2 = mul_asm(s0, s0);

            // Pairwise partial sums of state[1..], independent of the S-box chain.
            let sum1 = add_asm(state[1], state[2]);
            let sum2 = add_asm(state[3], state[4]);
            let sum3 = add_asm(state[5], state[6]);
            let sum4 = add_asm(state[7], state[8]);
            let sum5 = add_asm(state[9], state[10]);

            let s0_3 = mul_asm(s0_2, s0);
            let s0_4 = mul_asm(s0_2, s0_2);

            let sum12 = add_asm(sum1, sum2);
            let sum34 = add_asm(sum3, sum4);
            let sum511 = add_asm(sum5, state[11]);

            // |coef_i| * state[i]; the sign is applied at the store (add vs. sub).
            let d1 = state[1];
            let d2 = double_asm(state[2]);
            let d3 = div2_asm(state[3]);
            let d4 = add_asm(double_asm(state[4]), state[4]);

            let sum1234 = add_asm(sum12, sum34);

            let d5 = double_asm(double_asm(state[5]));
            let d6 = div2_asm(state[6]);
            let d7 = add_asm(double_asm(state[7]), state[7]);
            let d8 = double_asm(double_asm(state[8]));

            let sum_hi = add_asm(sum1234, sum511);

            let d9 = div4_asm(state[9]);
            let d10 = div4_asm(state[10]);
            let d11 = div8_asm(state[11]);

            // s0^7 = s0^3 * s0^4
            s0 = mul_asm(s0_3, s0_4);
            let sum = add_asm(sum_hi, s0);
            // Multiplier -2 on element 0: sum - 2*s0 = sum_hi - s0.
            s0 = sub_asm(sum_hi, s0);

            state[1] = add_asm(d1, sum);
            state[2] = add_asm(d2, sum);
            state[3] = add_asm(d3, sum);
            state[4] = add_asm(d4, sum);
            state[5] = add_asm(d5, sum);
            state[6] = sub_asm(sum, d6);
            state[7] = sub_asm(sum, d7);
            state[8] = sub_asm(sum, d8);
            state[9] = add_asm(d9, sum);
            state[10] = sub_asm(sum, d10);
            state[11] = add_asm(d11, sum);
        }
    }
    state[0] = s0;
}

/// Split-state dual-lane W12 internal permute for packed processing.
///
/// Same round function as the single-lane W12 variant, applied to two independent states
/// with the operations of lane `a` (`lane0`) and lane `b` (`lane1`) interleaved.
#[inline]
pub fn internal_permute_split_dual_w12(
    lane0: &mut [u64; 12],
    lane1: &mut [u64; 12],
    constants: &[u64],
) {
    let mut s0_a = lane0[0];
    let mut s0_b = lane1[0];
    for &rc in constants {
        unsafe {
            s0_a = add_asm(s0_a, rc);
            s0_b = add_asm(s0_b, rc);

            let s0_2_a = mul_asm(s0_a, s0_a);
            let s0_2_b = mul_asm(s0_b, s0_b);

            // Pairwise partial sums of elements 1.. of each lane.
            let sum1_a = add_asm(lane0[1], lane0[2]);
            let sum1_b = add_asm(lane1[1], lane1[2]);
            let sum2_a = add_asm(lane0[3], lane0[4]);
            let sum2_b = add_asm(lane1[3], lane1[4]);
            let sum3_a = add_asm(lane0[5], lane0[6]);
            let sum3_b = add_asm(lane1[5], lane1[6]);
            let sum4_a = add_asm(lane0[7], lane0[8]);
            let sum4_b = add_asm(lane1[7], lane1[8]);
            let sum5_a = add_asm(lane0[9], lane0[10]);
            let sum5_b = add_asm(lane1[9], lane1[10]);

            let s0_3_a = mul_asm(s0_2_a, s0_a);
            let s0_3_b = mul_asm(s0_2_b, s0_b);
            let s0_4_a = mul_asm(s0_2_a, s0_2_a);
            let s0_4_b = mul_asm(s0_2_b, s0_2_b);

            let sum12_a = add_asm(sum1_a, sum2_a);
            let sum12_b = add_asm(sum1_b, sum2_b);
            let sum34_a = add_asm(sum3_a, sum4_a);
            let sum34_b = add_asm(sum3_b, sum4_b);
            let sum511_a = add_asm(sum5_a, lane0[11]);
            let sum511_b = add_asm(sum5_b, lane1[11]);

            // |coef_i| * lane[i]; the sign is applied at the store (add vs. sub).
            let d1_a = lane0[1];
            let d1_b = lane1[1];
            let d2_a = double_asm(lane0[2]);
            let d2_b = double_asm(lane1[2]);
            let d3_a = div2_asm(lane0[3]);
            let d3_b = div2_asm(lane1[3]);
            let d4_a = add_asm(double_asm(lane0[4]), lane0[4]);
            let d4_b = add_asm(double_asm(lane1[4]), lane1[4]);

            let sum1234_a = add_asm(sum12_a, sum34_a);
            let sum1234_b = add_asm(sum12_b, sum34_b);

            let d5_a = double_asm(double_asm(lane0[5]));
            let d5_b = double_asm(double_asm(lane1[5]));
            let d6_a = div2_asm(lane0[6]);
            let d6_b = div2_asm(lane1[6]);
            let d7_a = add_asm(double_asm(lane0[7]), lane0[7]);
            let d7_b = add_asm(double_asm(lane1[7]), lane1[7]);
            let d8_a = double_asm(double_asm(lane0[8]));
            let d8_b = double_asm(double_asm(lane1[8]));

            let sum_hi_a = add_asm(sum1234_a, sum511_a);
            let sum_hi_b = add_asm(sum1234_b, sum511_b);

            let d9_a = div4_asm(lane0[9]);
            let d9_b = div4_asm(lane1[9]);
            let d10_a = div4_asm(lane0[10]);
            let d10_b = div4_asm(lane1[10]);
            let d11_a = div8_asm(lane0[11]);
            let d11_b = div8_asm(lane1[11]);

            // s0^7 = s0^3 * s0^4
            s0_a = mul_asm(s0_3_a, s0_4_a);
            s0_b = mul_asm(s0_3_b, s0_4_b);

            let sum_a = add_asm(sum_hi_a, s0_a);
            let sum_b = add_asm(sum_hi_b, s0_b);
            // Multiplier -2 on element 0: sum - 2*s0 = sum_hi - s0.
            s0_a = sub_asm(sum_hi_a, s0_a);
            s0_b = sub_asm(sum_hi_b, s0_b);

            lane0[1] = add_asm(d1_a, sum_a);
            lane1[1] = add_asm(d1_b, sum_b);
            lane0[2] = add_asm(d2_a, sum_a);
            lane1[2] = add_asm(d2_b, sum_b);
            lane0[3] = add_asm(d3_a, sum_a);
            lane1[3] = add_asm(d3_b, sum_b);
            lane0[4] = add_asm(d4_a, sum_a);
            lane1[4] = add_asm(d4_b, sum_b);
            lane0[5] = add_asm(d5_a, sum_a);
            lane1[5] = add_asm(d5_b, sum_b);
            lane0[6] = sub_asm(sum_a, d6_a);
            lane1[6] = sub_asm(sum_b, d6_b);
            lane0[7] = sub_asm(sum_a, d7_a);
            lane1[7] = sub_asm(sum_b, d7_b);
            lane0[8] = sub_asm(sum_a, d8_a);
            lane1[8] = sub_asm(sum_b, d8_b);
            lane0[9] = add_asm(d9_a, sum_a);
            lane1[9] = add_asm(d9_b, sum_b);
            lane0[10] = sub_asm(sum_a, d10_a);
            lane1[10] = sub_asm(sum_b, d10_b);
            lane0[11] = add_asm(d11_a, sum_a);
            lane1[11] = add_asm(d11_b, sum_b);
        }
    }
    lane0[0] = s0_a;
    lane1[0] = s0_b;
}

/// Split-state W16 internal permute: s0 stays in a register across all rounds.
///
/// Per round constant: `s0 += rc`, `s0 = s0^7`, then every element becomes
/// `coef_i * state[i] + sum`. As written the multipliers are
/// `[-2, 1, 2, 1/2, 3, 4, -1/2, -3, -4, 1/8, 1/16, 1/32, -1/8, -1/16, -1/32, 2^-32]`.
#[inline]
pub fn internal_permute_state_asm_w16(state: &mut [u64; 16], constants: &[u64]) {
    let mut s0 = state[0];
    for &rc in constants {
        unsafe {
            s0 = add_asm(s0, rc);
            let s0_2 = mul_asm(s0, s0);

            // Pairwise partial sums of state[1..], independent of the S-box chain.
            let sum1 = add_asm(state[1], state[2]);
            let sum2 = add_asm(state[3], state[4]);
            let sum3 = add_asm(state[5], state[6]);
            let sum4 = add_asm(state[7], state[8]);
            let sum5 = add_asm(state[9], state[10]);
            let sum6 = add_asm(state[11], state[12]);
            let sum7 = add_asm(state[13], state[14]);

            let s0_3 = mul_asm(s0_2, s0);
            let s0_4 = mul_asm(s0_2, s0_2);

            let sum12 = add_asm(sum1, sum2);
            let sum34 = add_asm(sum3, sum4);
            let sum56 = add_asm(sum5, sum6);
            let sum715 = add_asm(sum7, state[15]);

            let sum1234 = add_asm(sum12, sum34);
            let sum56715 = add_asm(sum56, sum715);
            let sum_hi = add_asm(sum1234, sum56715);

            // |coef_i| * state[i]; the sign is applied at the store (add vs. sub).
            let d1 = state[1];
            let d2 = double_asm(state[2]);
            let d3 = div2_asm(state[3]);
            let d4 = add_asm(double_asm(state[4]), state[4]);
            let d5 = double_asm(double_asm(state[5]));
            let d6 = div2_asm(state[6]);
            let d7 = add_asm(double_asm(state[7]), state[7]);
            let d8 = double_asm(double_asm(state[8]));

            let d9 = div8_asm(state[9]);
            let d10 = div16_asm(state[10]);
            let d11 = div32_asm(state[11]);
            let d12 = div8_asm(state[12]);
            let d13 = div16_asm(state[13]);
            let d14 = div32_asm(state[14]);
            let d15 = div_2_32_asm(state[15]);

            // s0^7 = s0^3 * s0^4
            s0 = mul_asm(s0_3, s0_4);
            let sum = add_asm(sum_hi, s0);
            // Multiplier -2 on element 0: sum - 2*s0 = sum_hi - s0.
            s0 = sub_asm(sum_hi, s0);

            state[1] = add_asm(d1, sum);
            state[2] = add_asm(d2, sum);
            state[3] = add_asm(d3, sum);
            state[4] = add_asm(d4, sum);
            state[5] = add_asm(d5, sum);
            state[6] = sub_asm(sum, d6);
            state[7] = sub_asm(sum, d7);
            state[8] = sub_asm(sum, d8);
            state[9] = add_asm(d9, sum);
            state[10] = add_asm(d10, sum);
            state[11] = add_asm(d11, sum);
            state[12] = sub_asm(sum, d12);
            state[13] = sub_asm(sum, d13);
            state[14] = sub_asm(sum, d14);
            state[15] = add_asm(d15, sum);
        }
    }
    state[0] = s0;
}
+ +/// Split-state dual-lane W16 internal permute for packed processing. +#[inline] +pub fn internal_permute_split_dual_w16( + lane0: &mut [u64; 16], + lane1: &mut [u64; 16], + constants: &[u64], +) { + let mut s0_a = lane0[0]; + let mut s0_b = lane1[0]; + for &rc in constants { + unsafe { + s0_a = add_asm(s0_a, rc); + s0_b = add_asm(s0_b, rc); + + let s0_2_a = mul_asm(s0_a, s0_a); + let s0_2_b = mul_asm(s0_b, s0_b); + + let sum1_a = add_asm(lane0[1], lane0[2]); + let sum1_b = add_asm(lane1[1], lane1[2]); + let sum2_a = add_asm(lane0[3], lane0[4]); + let sum2_b = add_asm(lane1[3], lane1[4]); + let sum3_a = add_asm(lane0[5], lane0[6]); + let sum3_b = add_asm(lane1[5], lane1[6]); + let sum4_a = add_asm(lane0[7], lane0[8]); + let sum4_b = add_asm(lane1[7], lane1[8]); + let sum5_a = add_asm(lane0[9], lane0[10]); + let sum5_b = add_asm(lane1[9], lane1[10]); + let sum6_a = add_asm(lane0[11], lane0[12]); + let sum6_b = add_asm(lane1[11], lane1[12]); + let sum7_a = add_asm(lane0[13], lane0[14]); + let sum7_b = add_asm(lane1[13], lane1[14]); + + let s0_3_a = mul_asm(s0_2_a, s0_a); + let s0_3_b = mul_asm(s0_2_b, s0_b); + let s0_4_a = mul_asm(s0_2_a, s0_2_a); + let s0_4_b = mul_asm(s0_2_b, s0_2_b); + + let sum12_a = add_asm(sum1_a, sum2_a); + let sum12_b = add_asm(sum1_b, sum2_b); + let sum34_a = add_asm(sum3_a, sum4_a); + let sum34_b = add_asm(sum3_b, sum4_b); + let sum56_a = add_asm(sum5_a, sum6_a); + let sum56_b = add_asm(sum5_b, sum6_b); + let sum715_a = add_asm(sum7_a, lane0[15]); + let sum715_b = add_asm(sum7_b, lane1[15]); + + let sum1234_a = add_asm(sum12_a, sum34_a); + let sum1234_b = add_asm(sum12_b, sum34_b); + let sum56715_a = add_asm(sum56_a, sum715_a); + let sum56715_b = add_asm(sum56_b, sum715_b); + let sum_hi_a = add_asm(sum1234_a, sum56715_a); + let sum_hi_b = add_asm(sum1234_b, sum56715_b); + + let d1_a = lane0[1]; + let d1_b = lane1[1]; + let d2_a = double_asm(lane0[2]); + let d2_b = double_asm(lane1[2]); + let d3_a = div2_asm(lane0[3]); + let d3_b = 
div2_asm(lane1[3]); + let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); + let d4_b = add_asm(double_asm(lane1[4]), lane1[4]); + let d5_a = double_asm(double_asm(lane0[5])); + let d5_b = double_asm(double_asm(lane1[5])); + let d6_a = div2_asm(lane0[6]); + let d6_b = div2_asm(lane1[6]); + let d7_a = add_asm(double_asm(lane0[7]), lane0[7]); + let d7_b = add_asm(double_asm(lane1[7]), lane1[7]); + let d8_a = double_asm(double_asm(lane0[8])); + let d8_b = double_asm(double_asm(lane1[8])); + + let d9_a = div8_asm(lane0[9]); + let d9_b = div8_asm(lane1[9]); + let d10_a = div16_asm(lane0[10]); + let d10_b = div16_asm(lane1[10]); + let d11_a = div32_asm(lane0[11]); + let d11_b = div32_asm(lane1[11]); + let d12_a = div8_asm(lane0[12]); + let d12_b = div8_asm(lane1[12]); + let d13_a = div16_asm(lane0[13]); + let d13_b = div16_asm(lane1[13]); + let d14_a = div32_asm(lane0[14]); + let d14_b = div32_asm(lane1[14]); + let d15_a = div_2_32_asm(lane0[15]); + let d15_b = div_2_32_asm(lane1[15]); + + s0_a = mul_asm(s0_3_a, s0_4_a); + s0_b = mul_asm(s0_3_b, s0_4_b); + + let sum_a = add_asm(sum_hi_a, s0_a); + let sum_b = add_asm(sum_hi_b, s0_b); + s0_a = sub_asm(sum_hi_a, s0_a); + s0_b = sub_asm(sum_hi_b, s0_b); + + lane0[1] = add_asm(d1_a, sum_a); + lane1[1] = add_asm(d1_b, sum_b); + lane0[2] = add_asm(d2_a, sum_a); + lane1[2] = add_asm(d2_b, sum_b); + lane0[3] = add_asm(d3_a, sum_a); + lane1[3] = add_asm(d3_b, sum_b); + lane0[4] = add_asm(d4_a, sum_a); + lane1[4] = add_asm(d4_b, sum_b); + lane0[5] = add_asm(d5_a, sum_a); + lane1[5] = add_asm(d5_b, sum_b); + lane0[6] = sub_asm(sum_a, d6_a); + lane1[6] = sub_asm(sum_b, d6_b); + lane0[7] = sub_asm(sum_a, d7_a); + lane1[7] = sub_asm(sum_b, d7_b); + lane0[8] = sub_asm(sum_a, d8_a); + lane1[8] = sub_asm(sum_b, d8_b); + lane0[9] = add_asm(d9_a, sum_a); + lane1[9] = add_asm(d9_b, sum_b); + lane0[10] = add_asm(d10_a, sum_a); + lane1[10] = add_asm(d10_b, sum_b); + lane0[11] = add_asm(d11_a, sum_a); + lane1[11] = add_asm(d11_b, sum_b); + 
lane0[12] = sub_asm(sum_a, d12_a); + lane1[12] = sub_asm(sum_b, d12_b); + lane0[13] = sub_asm(sum_a, d13_a); + lane1[13] = sub_asm(sum_b, d13_b); + lane0[14] = sub_asm(sum_a, d14_a); + lane1[14] = sub_asm(sum_b, d14_b); + lane0[15] = add_asm(d15_a, sum_a); + lane1[15] = add_asm(d15_b, sum_b); + } + } + lane0[0] = s0_a; + lane1[0] = s0_b; +} + +// External layer: S-box on all elements, then MDS. Pipelined for latency hiding. + +/// Double a Goldilocks element. +#[inline(always)] +unsafe fn double_asm(a: u64) -> u64 { + // SAFETY: add_asm is safe with valid Goldilocks field elements + unsafe { add_asm(a, a) } +} + +/// 4x4 circulant MDS with coefficients [2,3,1,1]. +#[inline(always)] +unsafe fn apply_mat4_asm(x: &mut [u64; 4]) { + unsafe { + let t01 = add_asm(x[0], x[1]); + let t23 = add_asm(x[2], x[3]); + let t0123 = add_asm(t01, t23); + let t01123 = add_asm(t0123, x[1]); + let t01233 = add_asm(t0123, x[3]); + + let y3 = add_asm(t01233, double_asm(x[0])); + let y1 = add_asm(t01123, double_asm(x[2])); + let y0 = add_asm(t01123, t01); + let y2 = add_asm(t01233, t23); + + x[0] = y0; + x[1] = y1; + x[2] = y2; + x[3] = y3; + } +} + +/// Poseidon2 MDS light permutation: 4x4 blocks + outer sums. +#[inline(always)] +pub unsafe fn mds_light_permutation_asm(state: &mut [u64; WIDTH]) { + unsafe { + // Apply M_4 to each consecutive four elements + let mut i = 0; + while i < WIDTH { + let chunk: &mut [u64; 4] = (&mut state[i..i + 4]).try_into().unwrap(); + apply_mat4_asm(chunk); + i += 4; + } + + // Compute the four sums of every 4th element + let mut sums = [0u64; 4]; + for j in (0..WIDTH).step_by(4) { + sums[0] = add_asm(sums[0], state[j]); + sums[1] = add_asm(sums[1], state[j + 1]); + sums[2] = add_asm(sums[2], state[j + 2]); + sums[3] = add_asm(sums[3], state[j + 3]); + } + + // Add sums back to state + for (i, elem) in state.iter_mut().enumerate() { + *elem = add_asm(*elem, sums[i % 4]); + } + } +} + +/// Pipelined S-box computation for all elements. 
+/// Computes x^7 for all elements by interleaving stages to hide latency. +#[inline(always)] +pub unsafe fn sbox_layer_asm(state: &mut [u64; WIDTH]) { + unsafe { + // Stage 1: Compute all x^2 values + let mut x2 = [0u64; WIDTH]; + for i in 0..WIDTH { + x2[i] = mul_asm(state[i], state[i]); + } + + // Stage 2: Compute x^3 and x^4 values interleaved + // x^3 = x^2 * x, x^4 = x^2 * x^2 + let mut x3 = [0u64; WIDTH]; + let mut x4 = [0u64; WIDTH]; + for i in 0..WIDTH { + x3[i] = mul_asm(x2[i], state[i]); + x4[i] = mul_asm(x2[i], x2[i]); + } + + // Stage 3: Compute x^7 = x^3 * x^4 + for i in 0..WIDTH { + state[i] = mul_asm(x3[i], x4[i]); + } + } +} + +/// Optimized external round: add RC, S-box, MDS. +#[inline(always)] +pub unsafe fn external_round_asm(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) { + unsafe { + // Add round constants + for i in 0..WIDTH { + state[i] = add_asm(state[i], rc[i]); + } + + // Apply S-box (x^7) to all elements + sbox_layer_asm(state); + + // Apply MDS light permutation + mds_light_permutation_asm(state); + } +} + +/// Interleaved dual-lane S-box layer for better ILP. 
+#[inline(always)] +pub unsafe fn sbox_layer_dual_asm( + state0: &mut [u64; WIDTH], + state1: &mut [u64; WIDTH], +) { + unsafe { + // Stage 1: Compute all x^2 values for both lanes (interleaved) + let mut x2_a = [0u64; WIDTH]; + let mut x2_b = [0u64; WIDTH]; + for i in 0..WIDTH { + x2_a[i] = mul_asm(state0[i], state0[i]); + x2_b[i] = mul_asm(state1[i], state1[i]); + } + + // Stage 2: Compute x^3 and x^4 for both lanes (interleaved) + let mut x3_a = [0u64; WIDTH]; + let mut x3_b = [0u64; WIDTH]; + let mut x4_a = [0u64; WIDTH]; + let mut x4_b = [0u64; WIDTH]; + for i in 0..WIDTH { + x3_a[i] = mul_asm(x2_a[i], state0[i]); + x3_b[i] = mul_asm(x2_b[i], state1[i]); + x4_a[i] = mul_asm(x2_a[i], x2_a[i]); + x4_b[i] = mul_asm(x2_b[i], x2_b[i]); + } + + // Stage 3: Compute x^7 = x^3 * x^4 for both lanes + for i in 0..WIDTH { + state0[i] = mul_asm(x3_a[i], x4_a[i]); + state1[i] = mul_asm(x3_b[i], x4_b[i]); + } + } +} + +/// Interleaved dual-lane external round for better ILP. +#[inline(always)] +pub unsafe fn external_round_dual_asm( + state0: &mut [u64; WIDTH], + state1: &mut [u64; WIDTH], + rc: &[u64; WIDTH], +) { + unsafe { + // Add round constants (interleaved) + for i in 0..WIDTH { + state0[i] = add_asm(state0[i], rc[i]); + state1[i] = add_asm(state1[i], rc[i]); + } + + // Apply S-box (interleaved dual-lane) + sbox_layer_dual_asm(state0, state1); + + // Apply MDS (sequential - MDS is mostly additions which are fast) + mds_light_permutation_asm(state0); + mds_light_permutation_asm(state1); + } +} + +/// Fully unrolled and fused external round for W8. 
///
/// Adds the round constants, raises each element to the 7th power and applies the MDS
/// light permutation. Constant additions and squarings are issued two elements at a time.
///
/// # Safety
///
/// Forwards to the unsafe field helpers and to `mds_light_permutation_asm`; presumably
/// every element must be a valid input for them — confirm against `utils`.
#[inline(always)]
pub unsafe fn external_round_fused_w8(state: &mut [u64; 8], rc: &[u64; 8]) {
    unsafe {
        // Round-constant addition fused with the first squaring, in pairs.
        let s0 = add_asm(state[0], rc[0]);
        let s1 = add_asm(state[1], rc[1]);
        let x2_0 = mul_asm(s0, s0);
        let x2_1 = mul_asm(s1, s1);

        let s2 = add_asm(state[2], rc[2]);
        let s3 = add_asm(state[3], rc[3]);
        let x2_2 = mul_asm(s2, s2);
        let x2_3 = mul_asm(s3, s3);

        let s4 = add_asm(state[4], rc[4]);
        let s5 = add_asm(state[5], rc[5]);
        let x2_4 = mul_asm(s4, s4);
        let x2_5 = mul_asm(s5, s5);

        let s6 = add_asm(state[6], rc[6]);
        let s7 = add_asm(state[7], rc[7]);
        let x2_6 = mul_asm(s6, s6);
        let x2_7 = mul_asm(s7, s7);

        // x^3 = x^2 * x and x^4 = x^2 * x^2.
        let x3_0 = mul_asm(x2_0, s0);
        let x3_1 = mul_asm(x2_1, s1);
        let x4_0 = mul_asm(x2_0, x2_0);
        let x4_1 = mul_asm(x2_1, x2_1);
        let x3_2 = mul_asm(x2_2, s2);
        let x3_3 = mul_asm(x2_3, s3);
        let x4_2 = mul_asm(x2_2, x2_2);
        let x4_3 = mul_asm(x2_3, x2_3);
        let x3_4 = mul_asm(x2_4, s4);
        let x3_5 = mul_asm(x2_5, s5);
        let x4_4 = mul_asm(x2_4, x2_4);
        let x4_5 = mul_asm(x2_5, x2_5);
        let x3_6 = mul_asm(x2_6, s6);
        let x3_7 = mul_asm(x2_7, s7);
        let x4_6 = mul_asm(x2_6, x2_6);
        let x4_7 = mul_asm(x2_7, x2_7);

        // x^7 = x^3 * x^4.
        state[0] = mul_asm(x3_0, x4_0);
        state[1] = mul_asm(x3_1, x4_1);
        state[2] = mul_asm(x3_2, x4_2);
        state[3] = mul_asm(x3_3, x4_3);
        state[4] = mul_asm(x3_4, x4_4);
        state[5] = mul_asm(x3_5, x4_5);
        state[6] = mul_asm(x3_6, x4_6);
        state[7] = mul_asm(x3_7, x4_7);

        mds_light_permutation_asm(state);
    }
}

/// Fully unrolled and fused dual-lane external round for W8.
+#[inline(always)] +pub unsafe fn external_round_fused_dual_w8( + state0: &mut [u64; 8], + state1: &mut [u64; 8], + rc: &[u64; 8], +) { + unsafe { + // Half 1: elements 0-3 across both lanes + let s0_a = add_asm(state0[0], rc[0]); + let s0_b = add_asm(state1[0], rc[0]); + let s1_a = add_asm(state0[1], rc[1]); + let s1_b = add_asm(state1[1], rc[1]); + let s2_a = add_asm(state0[2], rc[2]); + let s2_b = add_asm(state1[2], rc[2]); + let s3_a = add_asm(state0[3], rc[3]); + let s3_b = add_asm(state1[3], rc[3]); + + let x2_0a = mul_asm(s0_a, s0_a); + let x2_0b = mul_asm(s0_b, s0_b); + let x2_1a = mul_asm(s1_a, s1_a); + let x2_1b = mul_asm(s1_b, s1_b); + let x2_2a = mul_asm(s2_a, s2_a); + let x2_2b = mul_asm(s2_b, s2_b); + let x2_3a = mul_asm(s3_a, s3_a); + let x2_3b = mul_asm(s3_b, s3_b); + + let x3_0a = mul_asm(x2_0a, s0_a); + let x3_0b = mul_asm(x2_0b, s0_b); + let x4_0a = mul_asm(x2_0a, x2_0a); + let x4_0b = mul_asm(x2_0b, x2_0b); + let x3_1a = mul_asm(x2_1a, s1_a); + let x3_1b = mul_asm(x2_1b, s1_b); + let x4_1a = mul_asm(x2_1a, x2_1a); + let x4_1b = mul_asm(x2_1b, x2_1b); + let x3_2a = mul_asm(x2_2a, s2_a); + let x3_2b = mul_asm(x2_2b, s2_b); + let x4_2a = mul_asm(x2_2a, x2_2a); + let x4_2b = mul_asm(x2_2b, x2_2b); + let x3_3a = mul_asm(x2_3a, s3_a); + let x3_3b = mul_asm(x2_3b, s3_b); + let x4_3a = mul_asm(x2_3a, x2_3a); + let x4_3b = mul_asm(x2_3b, x2_3b); + + state0[0] = mul_asm(x3_0a, x4_0a); + state1[0] = mul_asm(x3_0b, x4_0b); + state0[1] = mul_asm(x3_1a, x4_1a); + state1[1] = mul_asm(x3_1b, x4_1b); + state0[2] = mul_asm(x3_2a, x4_2a); + state1[2] = mul_asm(x3_2b, x4_2b); + state0[3] = mul_asm(x3_3a, x4_3a); + state1[3] = mul_asm(x3_3b, x4_3b); + + // Half 2: elements 4-7 across both lanes + let s4_a = add_asm(state0[4], rc[4]); + let s4_b = add_asm(state1[4], rc[4]); + let s5_a = add_asm(state0[5], rc[5]); + let s5_b = add_asm(state1[5], rc[5]); + let s6_a = add_asm(state0[6], rc[6]); + let s6_b = add_asm(state1[6], rc[6]); + let s7_a = add_asm(state0[7], 
rc[7]); + let s7_b = add_asm(state1[7], rc[7]); + + let x2_4a = mul_asm(s4_a, s4_a); + let x2_4b = mul_asm(s4_b, s4_b); + let x2_5a = mul_asm(s5_a, s5_a); + let x2_5b = mul_asm(s5_b, s5_b); + let x2_6a = mul_asm(s6_a, s6_a); + let x2_6b = mul_asm(s6_b, s6_b); + let x2_7a = mul_asm(s7_a, s7_a); + let x2_7b = mul_asm(s7_b, s7_b); + + let x3_4a = mul_asm(x2_4a, s4_a); + let x3_4b = mul_asm(x2_4b, s4_b); + let x4_4a = mul_asm(x2_4a, x2_4a); + let x4_4b = mul_asm(x2_4b, x2_4b); + let x3_5a = mul_asm(x2_5a, s5_a); + let x3_5b = mul_asm(x2_5b, s5_b); + let x4_5a = mul_asm(x2_5a, x2_5a); + let x4_5b = mul_asm(x2_5b, x2_5b); + let x3_6a = mul_asm(x2_6a, s6_a); + let x3_6b = mul_asm(x2_6b, s6_b); + let x4_6a = mul_asm(x2_6a, x2_6a); + let x4_6b = mul_asm(x2_6b, x2_6b); + let x3_7a = mul_asm(x2_7a, s7_a); + let x3_7b = mul_asm(x2_7b, s7_b); + let x4_7a = mul_asm(x2_7a, x2_7a); + let x4_7b = mul_asm(x2_7b, x2_7b); + + state0[4] = mul_asm(x3_4a, x4_4a); + state1[4] = mul_asm(x3_4b, x4_4b); + state0[5] = mul_asm(x3_5a, x4_5a); + state1[5] = mul_asm(x3_5b, x4_5b); + state0[6] = mul_asm(x3_6a, x4_6a); + state1[6] = mul_asm(x3_6b, x4_6b); + state0[7] = mul_asm(x3_7a, x4_7a); + state1[7] = mul_asm(x3_7b, x4_7b); + + mds_light_permutation_asm(state0); + mds_light_permutation_asm(state1); + } +} + +/// Run initial external rounds with pre-converted raw u64 constants. +#[inline] +pub fn external_initial_permute_state_asm( + state: &mut [u64; WIDTH], + initial_constants: &[[u64; WIDTH]], +) { + unsafe { + mds_light_permutation_asm(state); + } + for rc in initial_constants { + unsafe { + external_round_asm(state, rc); + } + } +} + +/// Run terminal external rounds with pre-converted raw u64 constants. +#[inline] +pub fn external_terminal_permute_state_asm( + state: &mut [u64; WIDTH], + terminal_constants: &[[u64; WIDTH]], +) { + for rc in terminal_constants { + unsafe { + external_round_asm(state, rc); + } + } +} + +/// W8-specialized initial external permute using fused rounds. 
+#[inline] +pub fn external_initial_permute_w8(state: &mut [u64; 8], initial_constants: &[[u64; 8]]) { + unsafe { + mds_light_permutation_asm(state); + } + for rc in initial_constants { + unsafe { + external_round_fused_w8(state, rc); + } + } +} + +/// W8-specialized terminal external permute using fused rounds. +#[inline] +pub fn external_terminal_permute_w8(state: &mut [u64; 8], terminal_constants: &[[u64; 8]]) { + for rc in terminal_constants { + unsafe { + external_round_fused_w8(state, rc); + } + } +} + +/// Dual-lane initial external permute with pre-converted constants. +#[inline] +pub fn external_initial_permute_dual( + lane0: &mut [u64; WIDTH], + lane1: &mut [u64; WIDTH], + constants: &[[u64; WIDTH]], +) { + unsafe { + mds_light_permutation_asm(lane0); + mds_light_permutation_asm(lane1); + } + for rc in constants { + unsafe { + external_round_dual_asm(lane0, lane1, rc); + } + } +} + +/// Dual-lane terminal external permute with pre-converted constants. +#[inline] +pub fn external_terminal_permute_dual( + lane0: &mut [u64; WIDTH], + lane1: &mut [u64; WIDTH], + constants: &[[u64; WIDTH]], +) { + for rc in constants { + unsafe { + external_round_dual_asm(lane0, lane1, rc); + } + } +} + +/// W8-specialized dual-lane initial external permute using fused rounds. +#[inline] +pub fn external_initial_permute_dual_w8( + lane0: &mut [u64; 8], + lane1: &mut [u64; 8], + constants: &[[u64; 8]], +) { + unsafe { + mds_light_permutation_asm(lane0); + mds_light_permutation_asm(lane1); + } + for rc in constants { + unsafe { + external_round_fused_dual_w8(lane0, lane1, rc); + } + } +} + +/// W8-specialized dual-lane terminal external permute using fused rounds. +#[inline] +pub fn external_terminal_permute_dual_w8( + lane0: &mut [u64; 8], + lane1: &mut [u64; 8], + constants: &[[u64; 8]], +) { + for rc in constants { + unsafe { + external_round_fused_dual_w8(lane0, lane1, rc); + } + } +} + +// NEON 2-wide Goldilocks field primitives. 
+// Each operates on both packed lanes simultaneously using uint64x2_t. + +#[inline(always)] +unsafe fn add_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { + let res = vaddq_u64(a, b); + let overflow = vcgtq_u64(a, res); + let adj = vshrq_n_u64::<32>(overflow); + vaddq_u64(res, adj) + } +} + +#[inline(always)] +unsafe fn sub_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + unsafe { + let res = vsubq_u64(a, b); + let underflow = vcgtq_u64(b, a); + let adj = vshrq_n_u64::<32>(underflow); + vsubq_u64(res, adj) + } +} + +#[inline(always)] +unsafe fn double_neon(a: uint64x2_t) -> uint64x2_t { + unsafe { add_neon(a, a) } +} + +#[inline(always)] +unsafe fn div2_neon(x: uint64x2_t) -> uint64x2_t { + unsafe { + let half_p_plus_1 = vdupq_n_u64((P + 1) >> 1); + let one = vdupq_n_u64(1); + let is_odd = vandq_u64(x, one); + let half = vshrq_n_u64::<1>(x); + let mask = vtstq_u64(is_odd, is_odd); + let adj = vandq_u64(mask, half_p_plus_1); + vaddq_u64(half, adj) + } +} + +#[inline(always)] +unsafe fn div4_neon(x: uint64x2_t) -> uint64x2_t { + unsafe { div2_neon(div2_neon(x)) } +} + +#[inline(always)] +unsafe fn div8_neon(x: uint64x2_t) -> uint64x2_t { + unsafe { div2_neon(div4_neon(x)) } +} + +#[inline(always)] +unsafe fn div16_neon(x: uint64x2_t) -> uint64x2_t { + unsafe { div2_neon(div8_neon(x)) } +} + +#[inline(always)] +unsafe fn div32_neon(x: uint64x2_t) -> uint64x2_t { + unsafe { div4_neon(div8_neon(x)) } +} + +/// Compute x * 2^{-32} mod P for each lane using Goldilocks structure. +/// +/// Since P = 2^64 - 2^32 + 1, we have 2^{-32} ≡ 1 - 2^{32} (mod P). +/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P. 
+#[inline(always)] +unsafe fn div_2_32_neon(x: uint64x2_t) -> uint64x2_t { + unsafe { + let mask_32 = vdupq_n_u64(0xFFFFFFFF); + let hi = vshrq_n_u64::<32>(x); + let lo = vandq_u64(x, mask_32); + let sum = vaddq_u64(hi, lo); + let t = vshlq_n_u64::<32>(lo); + sub_neon(sum, t) + } +} + +#[inline(always)] +unsafe fn apply_mat4_neon(x: &mut [uint64x2_t; 4]) { + unsafe { + let t01 = add_neon(x[0], x[1]); + let t23 = add_neon(x[2], x[3]); + let t0123 = add_neon(t01, t23); + let t01123 = add_neon(t0123, x[1]); + let t01233 = add_neon(t0123, x[3]); + x[3] = add_neon(t01233, double_neon(x[0])); + x[1] = add_neon(t01123, double_neon(x[2])); + x[0] = add_neon(t01123, t01); + x[2] = add_neon(t01233, t23); + } +} + +#[inline(always)] +unsafe fn mds_light_neon(state: &mut [uint64x2_t; WIDTH]) { + unsafe { + let mut i = 0; + while i < WIDTH { + let chunk: &mut [uint64x2_t; 4] = (&mut state[i..i + 4]).try_into().unwrap(); + apply_mat4_neon(chunk); + i += 4; + } + let zero = vdupq_n_u64(0); + let mut sums = [zero; 4]; + for j in (0..WIDTH).step_by(4) { + sums[0] = add_neon(sums[0], state[j]); + sums[1] = add_neon(sums[1], state[j + 1]); + sums[2] = add_neon(sums[2], state[j + 2]); + sums[3] = add_neon(sums[3], state[j + 3]); + } + for (i, elem) in state.iter_mut().enumerate() { + *elem = add_neon(*elem, sums[i % 4]); + } + } +} + +/// Convert separate lane arrays into NEON vector array. +#[inline] +pub fn lanes_to_neon( + lane0: &[u64; WIDTH], + lane1: &[u64; WIDTH], +) -> [uint64x2_t; WIDTH] { + core::array::from_fn(|i| unsafe { + let lo = vcreate_u64(lane0[i]); + let hi = vcreate_u64(lane1[i]); + vcombine_u64(lo, hi) + }) +} + +/// Convert NEON vector array back to separate lane arrays. 
+#[inline] +pub fn neon_to_lanes( + state_v: &[uint64x2_t; WIDTH], + lane0: &mut [u64; WIDTH], + lane1: &mut [u64; WIDTH], +) { + for i in 0..WIDTH { + unsafe { + lane0[i] = vgetq_lane_u64::<0>(state_v[i]); + lane1[i] = vgetq_lane_u64::<1>(state_v[i]); + } + } +} + +// NEON-based internal permutation: both packed lanes processed +// simultaneously via uint64x2_t for sum tree, diagonal, and writeback. + +#[inline] +pub fn internal_permute_neon_w12(state: &mut [uint64x2_t; 12], constants: &[u64]) { + let mut s0 = state[0]; + for &rc in constants { + unsafe { + let rc_vec = vdupq_n_u64(rc); + s0 = add_neon(s0, rc_vec); + + let s0_0 = vgetq_lane_u64::<0>(s0); + let s0_1 = vgetq_lane_u64::<1>(s0); + let s0_2_0 = mul_asm(s0_0, s0_0); + let s0_2_1 = mul_asm(s0_1, s0_1); + + let sum1 = add_neon(state[1], state[2]); + let sum2 = add_neon(state[3], state[4]); + let sum3 = add_neon(state[5], state[6]); + let sum4 = add_neon(state[7], state[8]); + let sum5 = add_neon(state[9], state[10]); + + let s0_3_0 = mul_asm(s0_2_0, s0_0); + let s0_3_1 = mul_asm(s0_2_1, s0_1); + let s0_4_0 = mul_asm(s0_2_0, s0_2_0); + let s0_4_1 = mul_asm(s0_2_1, s0_2_1); + + let sum12 = add_neon(sum1, sum2); + let sum34 = add_neon(sum3, sum4); + let sum511 = add_neon(sum5, state[11]); + + let d1 = state[1]; + let d2 = double_neon(state[2]); + let d3 = div2_neon(state[3]); + let d4 = add_neon(double_neon(state[4]), state[4]); + + let sum1234 = add_neon(sum12, sum34); + + let d5 = double_neon(double_neon(state[5])); + let d6 = div2_neon(state[6]); + let d7 = add_neon(double_neon(state[7]), state[7]); + let d8 = double_neon(double_neon(state[8])); + + let sum_hi = add_neon(sum1234, sum511); + + let d9 = div4_neon(state[9]); + let d10 = div4_neon(state[10]); + let d11 = div8_neon(state[11]); + + let s0_7_0 = mul_asm(s0_3_0, s0_4_0); + let s0_7_1 = mul_asm(s0_3_1, s0_4_1); + let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1)); + + let sum = add_neon(sum_hi, s0_7); + s0 = sub_neon(sum_hi, s0_7); 
+ + state[1] = add_neon(d1, sum); + state[2] = add_neon(d2, sum); + state[3] = add_neon(d3, sum); + state[4] = add_neon(d4, sum); + state[5] = add_neon(d5, sum); + state[6] = sub_neon(sum, d6); + state[7] = sub_neon(sum, d7); + state[8] = sub_neon(sum, d8); + state[9] = add_neon(d9, sum); + state[10] = sub_neon(sum, d10); + state[11] = add_neon(d11, sum); + } + } + state[0] = s0; +} + +#[inline] +pub fn internal_permute_neon_w16(state: &mut [uint64x2_t; 16], constants: &[u64]) { + let mut s0 = state[0]; + for &rc in constants { + unsafe { + let rc_vec = vdupq_n_u64(rc); + s0 = add_neon(s0, rc_vec); + + let s0_0 = vgetq_lane_u64::<0>(s0); + let s0_1 = vgetq_lane_u64::<1>(s0); + let s0_2_0 = mul_asm(s0_0, s0_0); + let s0_2_1 = mul_asm(s0_1, s0_1); + + let sum1 = add_neon(state[1], state[2]); + let sum2 = add_neon(state[3], state[4]); + let sum3 = add_neon(state[5], state[6]); + let sum4 = add_neon(state[7], state[8]); + let sum5 = add_neon(state[9], state[10]); + let sum6 = add_neon(state[11], state[12]); + let sum7 = add_neon(state[13], state[14]); + + let s0_3_0 = mul_asm(s0_2_0, s0_0); + let s0_3_1 = mul_asm(s0_2_1, s0_1); + let s0_4_0 = mul_asm(s0_2_0, s0_2_0); + let s0_4_1 = mul_asm(s0_2_1, s0_2_1); + + let sum12 = add_neon(sum1, sum2); + let sum34 = add_neon(sum3, sum4); + let sum56 = add_neon(sum5, sum6); + let sum715 = add_neon(sum7, state[15]); + + let sum1234 = add_neon(sum12, sum34); + let sum56715 = add_neon(sum56, sum715); + let sum_hi = add_neon(sum1234, sum56715); + + let d1 = state[1]; + let d2 = double_neon(state[2]); + let d3 = div2_neon(state[3]); + let d4 = add_neon(double_neon(state[4]), state[4]); + let d5 = double_neon(double_neon(state[5])); + let d6 = div2_neon(state[6]); + let d7 = add_neon(double_neon(state[7]), state[7]); + let d8 = double_neon(double_neon(state[8])); + + let d9 = div8_neon(state[9]); + let d10 = div16_neon(state[10]); + let d11 = div32_neon(state[11]); + let d12 = div8_neon(state[12]); + let d13 = div16_neon(state[13]); + 
let d14 = div32_neon(state[14]); + let d15 = div_2_32_neon(state[15]); + + let s0_7_0 = mul_asm(s0_3_0, s0_4_0); + let s0_7_1 = mul_asm(s0_3_1, s0_4_1); + let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1)); + + let sum = add_neon(sum_hi, s0_7); + s0 = sub_neon(sum_hi, s0_7); + + state[1] = add_neon(d1, sum); + state[2] = add_neon(d2, sum); + state[3] = add_neon(d3, sum); + state[4] = add_neon(d4, sum); + state[5] = add_neon(d5, sum); + state[6] = sub_neon(sum, d6); + state[7] = sub_neon(sum, d7); + state[8] = sub_neon(sum, d8); + state[9] = add_neon(d9, sum); + state[10] = add_neon(d10, sum); + state[11] = add_neon(d11, sum); + state[12] = sub_neon(sum, d12); + state[13] = sub_neon(sum, d13); + state[14] = sub_neon(sum, d14); + state[15] = add_neon(d15, sum); + } + } + state[0] = s0; +} + +#[inline] +pub fn internal_permute_neon( + state: &mut [uint64x2_t; WIDTH], + diag: &[u64; WIDTH], + constants: &[u64], +) { + let mut s0 = state[0]; + for &rc in constants { + unsafe { + let rc_vec = vdupq_n_u64(rc); + s0 = add_neon(s0, rc_vec); + + let s0_0 = vgetq_lane_u64::<0>(s0); + let s0_1 = vgetq_lane_u64::<1>(s0); + let s0_2_0 = mul_asm(s0_0, s0_0); + let s0_2_1 = mul_asm(s0_1, s0_1); + let s0_3_0 = mul_asm(s0_2_0, s0_0); + let s0_3_1 = mul_asm(s0_2_1, s0_1); + let s0_4_0 = mul_asm(s0_2_0, s0_2_0); + let s0_4_1 = mul_asm(s0_2_1, s0_2_1); + let s0_7_0 = mul_asm(s0_3_0, s0_4_0); + let s0_7_1 = mul_asm(s0_3_1, s0_4_1); + let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1)); + + let zero = vdupq_n_u64(0); + let mut sum_hi = zero; + for &s in state.iter().skip(1) { + sum_hi = add_neon(sum_hi, s); + } + + let sum = add_neon(sum_hi, s0_7); + s0 = vcombine_u64( + vcreate_u64(mul_add_asm(s0_7_0, diag[0], vgetq_lane_u64::<0>(sum))), + vcreate_u64(mul_add_asm(s0_7_1, diag[0], vgetq_lane_u64::<1>(sum))), + ); + + for i in 1..WIDTH { + let s_0 = mul_add_asm( + vgetq_lane_u64::<0>(state[i]), + diag[i], + vgetq_lane_u64::<0>(sum), + ); + let s_1 = 
mul_add_asm( + vgetq_lane_u64::<1>(state[i]), + diag[i], + vgetq_lane_u64::<1>(sum), + ); + state[i] = vcombine_u64(vcreate_u64(s_0), vcreate_u64(s_1)); + } + } + } + state[0] = s0; +} + +// NEON-based external round: S-box stays scalar, MDS uses NEON. + +#[inline(always)] +unsafe fn sbox_neon(state: &mut [uint64x2_t; WIDTH]) { + unsafe { + let mut x2_0 = [0u64; WIDTH]; + let mut x2_1 = [0u64; WIDTH]; + for i in 0..WIDTH { + let a = vgetq_lane_u64::<0>(state[i]); + let b = vgetq_lane_u64::<1>(state[i]); + x2_0[i] = mul_asm(a, a); + x2_1[i] = mul_asm(b, b); + } + let mut x3_0 = [0u64; WIDTH]; + let mut x3_1 = [0u64; WIDTH]; + let mut x4_0 = [0u64; WIDTH]; + let mut x4_1 = [0u64; WIDTH]; + for i in 0..WIDTH { + let a = vgetq_lane_u64::<0>(state[i]); + let b = vgetq_lane_u64::<1>(state[i]); + x3_0[i] = mul_asm(x2_0[i], a); + x3_1[i] = mul_asm(x2_1[i], b); + x4_0[i] = mul_asm(x2_0[i], x2_0[i]); + x4_1[i] = mul_asm(x2_1[i], x2_1[i]); + } + for i in 0..WIDTH { + let r0 = mul_asm(x3_0[i], x4_0[i]); + let r1 = mul_asm(x3_1[i], x4_1[i]); + state[i] = vcombine_u64(vcreate_u64(r0), vcreate_u64(r1)); + } + } +} + +#[inline(always)] +unsafe fn external_round_neon( + state: &mut [uint64x2_t; WIDTH], + rc: &[u64; WIDTH], +) { + unsafe { + for i in 0..WIDTH { + let rc_vec = vdupq_n_u64(rc[i]); + state[i] = add_neon(state[i], rc_vec); + } + sbox_neon(state); + mds_light_neon(state); + } +} + +/// NEON initial external permute. +#[inline] +pub fn external_initial_neon( + state: &mut [uint64x2_t; WIDTH], + constants: &[[u64; WIDTH]], +) { + unsafe { + mds_light_neon(state); + } + for rc in constants { + unsafe { + external_round_neon(state, rc); + } + } +} + +/// NEON terminal external permute. 
+#[inline] +pub fn external_terminal_neon( + state: &mut [uint64x2_t; WIDTH], + constants: &[[u64; WIDTH]], +) { + for rc in constants { + unsafe { + external_round_neon(state, rc); + } + } +} + +#[cfg(test)] +mod tests { + use alloc::vec::Vec; + + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + use p3_poseidon2::{MDSMat4, matmul_internal, mds_light_permutation}; + use proptest::prelude::*; + use rand::rngs::SmallRng; + use rand::{RngExt, SeedableRng}; + + use super::*; + use crate::{ + Goldilocks, MATRIX_DIAG_8_GOLDILOCKS, MATRIX_DIAG_12_GOLDILOCKS, MATRIX_DIAG_16_GOLDILOCKS, + MATRIX_DIAG_20_GOLDILOCKS, + }; + + type F = Goldilocks; + + /// Reduce a raw u64 to its canonical Goldilocks representative. + fn canon(x: u64) -> u64 { + F::new(x).as_canonical_u64() + } + + /// Pack two u64 lanes into a single NEON vector. + unsafe fn make_neon(a: u64, b: u64) -> uint64x2_t { + unsafe { vcombine_u64(vcreate_u64(a), vcreate_u64(b)) } + } + + /// Extract both u64 lanes from a NEON vector. + unsafe fn read_neon(v: uint64x2_t) -> (u64, u64) { + unsafe { (vgetq_lane_u64::<0>(v), vgetq_lane_u64::<1>(v)) } + } + + proptest! { + #[test] + fn test_sub_asm(a: u64, b: u64) { + // Compute a - b using the standard field implementation. + let expected = (F::new(a) - F::new(b)).as_canonical_u64(); + + // The ASM version should give the same canonical result. + let got = canon(unsafe { sub_asm(a, b) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_double_asm(a: u64) { + // Doubling is just a + a in the field. + let expected = (F::new(a) + F::new(a)).as_canonical_u64(); + + // The ASM shortcut should match. + let got = canon(unsafe { double_asm(a) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_div2_asm(x: u64) { + // Dividing by 2 is one halving in the field. 
+ let expected = F::new(x).halve().as_canonical_u64(); + + let got = canon(unsafe { div2_asm(x) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_div4_asm(x: u64) { + // Dividing by 4 is two halvings. + let expected = F::new(x).halve().halve().as_canonical_u64(); + + let got = canon(unsafe { div4_asm(x) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_div8_asm(x: u64) { + // Dividing by 8 is three halvings. + let expected = F::new(x).halve().halve().halve().as_canonical_u64(); + + let got = canon(unsafe { div8_asm(x) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_div16_asm(x: u64) { + // Dividing by 16 is four halvings. + let expected = F::new(x).halve().halve().halve().halve().as_canonical_u64(); + + let got = canon(unsafe { div16_asm(x) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_div32_asm(x: u64) { + // Dividing by 32 is five halvings. + let expected = F::new(x) + .halve().halve().halve().halve().halve() + .as_canonical_u64(); + + let got = canon(unsafe { div32_asm(x) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_div_2_32_asm(x: u64) { + // Dividing by 2^32: apply halve 32 times as reference. + let mut v = F::new(x); + for _ in 0..32 { + v = v.halve(); + } + let expected = v.as_canonical_u64(); + + let got = canon(unsafe { div_2_32_asm(x) }); + prop_assert_eq!(got, expected); + } + + #[test] + fn test_apply_mat4_asm(x0: u64, x1: u64, x2: u64, x3: u64) { + // Build field elements from the raw inputs. + let f = [F::new(x0), F::new(x1), F::new(x2), F::new(x3)]; + + // The [2,3,1,1] circulant matrix rows. + let two = F::TWO; + let three = two + F::ONE; + let e0 = two * f[0] + three * f[1] + f[2] + f[3]; + let e1 = f[0] + two * f[1] + three * f[2] + f[3]; + let e2 = f[0] + f[1] + two * f[2] + three * f[3]; + let e3 = three * f[0] + f[1] + f[2] + two * f[3]; + + // Run the ASM version on raw u64s. 
+ let mut state = [x0, x1, x2, x3]; + unsafe { apply_mat4_asm(&mut state); } + + // Each slot must match the field-level reference. + prop_assert_eq!(canon(state[0]), e0.as_canonical_u64()); + prop_assert_eq!(canon(state[1]), e1.as_canonical_u64()); + prop_assert_eq!(canon(state[2]), e2.as_canonical_u64()); + prop_assert_eq!(canon(state[3]), e3.as_canonical_u64()); + } + + #[test] + fn test_mds_light_permutation_asm_w8(vals in prop::array::uniform8(any::())) { + // Build field-level state and apply the generic MDS. + let mut state_generic: [F; 8] = vals.map(F::new); + mds_light_permutation(&mut state_generic, &MDSMat4); + + // Run the ASM version on the same raw values. + let mut state_asm = vals; + unsafe { mds_light_permutation_asm(&mut state_asm); } + + // Every element must agree. + for i in 0..8 { + prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64()); + } + } + + #[test] + fn test_mds_light_permutation_asm_w12(vals in prop::array::uniform12(any::())) { + let mut state_generic: [F; 12] = vals.map(F::new); + mds_light_permutation(&mut state_generic, &MDSMat4); + + let mut state_asm = vals; + unsafe { mds_light_permutation_asm(&mut state_asm); } + + for i in 0..12 { + prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64()); + } + } + + #[test] + fn test_mds_light_permutation_asm_w16(vals in prop::array::uniform16(any::())) { + let mut state_generic: [F; 16] = vals.map(F::new); + mds_light_permutation(&mut state_generic, &MDSMat4); + + let mut state_asm = vals; + unsafe { mds_light_permutation_asm(&mut state_asm); } + + for i in 0..16 { + prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64()); + } + } + + #[test] + fn test_sbox_layer_asm(vals in prop::array::uniform8(any::())) { + // Apply the ASM S-box to a copy of the input. + let mut state = vals; + unsafe { sbox_layer_asm(&mut state); } + + // Verify each element is x^7 = x^3 * x^4. 
+ for i in 0..8 { + let x = F::new(vals[i]); + let x2 = x * x; + let x3 = x2 * x; + let x4 = x2 * x2; + let x7 = x3 * x4; + prop_assert_eq!(canon(state[i]), x7.as_canonical_u64()); + } + } + + #[test] + fn test_external_round_asm( + vals in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + // Build reference: add round constants, apply x^7, then MDS. + let mut expected: [F; 8] = core::array::from_fn(|i| F::new(vals[i]) + F::new(rc[i])); + for x in expected.iter_mut() { + let x2 = *x * *x; + let x3 = x2 * *x; + let x4 = x2 * x2; + *x = x3 * x4; + } + mds_light_permutation(&mut expected, &MDSMat4); + + // Run the ASM external round. + let mut state = vals; + unsafe { external_round_asm(&mut state, &rc); } + + for i in 0..8 { + prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); + } + } + + #[test] + fn test_sbox_layer_dual_asm( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + // Run sbox on each lane independently as reference. + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + sbox_layer_asm(&mut ref0); + sbox_layer_asm(&mut ref1); + } + + // The dual-lane version processes both at once. + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { sbox_layer_dual_asm(&mut s0, &mut s1); } + + // Both lanes must match their single-lane reference. + for i in 0..8 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + #[test] + fn test_external_round_dual_asm( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + // Run external round on each lane independently as reference. + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + external_round_asm(&mut ref0, &rc); + external_round_asm(&mut ref1, &rc); + } + + // The dual-lane version processes both at once. 
+ let mut s0 = vals0; + let mut s1 = vals1; + unsafe { external_round_dual_asm(&mut s0, &mut s1, &rc); } + + for i in 0..8 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + + #[test] + fn test_external_round_fused_w8( + vals in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + // The generic external round is the reference. + let mut ref_state = vals; + unsafe { external_round_asm(&mut ref_state, &rc); } + + // The fused W8 version should produce the same output. + let mut fused_state = vals; + unsafe { external_round_fused_w8(&mut fused_state, &rc); } + + for i in 0..8 { + prop_assert_eq!(canon(fused_state[i]), canon(ref_state[i])); + } + } + + #[test] + fn test_external_round_fused_dual_w8( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + // Run the fused round on each lane independently as reference. + let mut ref0 = vals0; + let mut ref1 = vals1; + unsafe { + external_round_fused_w8(&mut ref0, &rc); + external_round_fused_w8(&mut ref1, &rc); + } + + // The dual version processes both at once. + let mut s0 = vals0; + let mut s1 = vals1; + unsafe { external_round_fused_dual_w8(&mut s0, &mut s1, &rc); } + + for i in 0..8 { + prop_assert_eq!(canon(s0[i]), canon(ref0[i])); + prop_assert_eq!(canon(s1[i]), canon(ref1[i])); + } + } + } + + fn test_internal_round_matches(diag: [F; WIDTH]) { + let mut rng = SmallRng::seed_from_u64(12345); + + // Build random state and constants. + let mut state_asm: [F; WIDTH] = rng.random(); + let mut state_generic = state_asm; + + let internal_constants: [F; 22] = rng.random(); + let constants_raw: Vec = internal_constants.iter().map(|c| c.value).collect(); + let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); + + // Run the ASM internal permute on raw u64 representation. 
+ let state_raw: &mut [u64; WIDTH] = + unsafe { &mut *(&mut state_asm as *mut [F; WIDTH] as *mut [u64; WIDTH]) }; + internal_permute_state_asm(state_raw, &diag_raw, &constants_raw); + + // Build the same result via field-level ops: add RC, S-box on s0, matmul. + for &rc in internal_constants.iter() { + state_generic[0] += rc; + let s = state_generic[0]; + let s2 = s * s; + let s3 = s2 * s; + let s4 = s2 * s2; + state_generic[0] = s3 * s4; + matmul_internal(&mut state_generic, diag); + } + + for i in 0..WIDTH { + assert_eq!( + state_asm[i].as_canonical_u64(), + state_generic[i].as_canonical_u64(), + "mismatch at index {i}" + ); + } + } + + #[test] + fn test_internal_round_width_8() { + test_internal_round_matches(MATRIX_DIAG_8_GOLDILOCKS); + } + + #[test] + fn test_internal_round_width_12() { + test_internal_round_matches(MATRIX_DIAG_12_GOLDILOCKS); + } + + #[test] + fn test_internal_round_width_16() { + test_internal_round_matches(MATRIX_DIAG_16_GOLDILOCKS); + } + + #[test] + fn test_internal_round_width_20() { + test_internal_round_matches(MATRIX_DIAG_20_GOLDILOCKS); + } + + fn test_specialized_matches_generic( + diag: [F; WIDTH], + specialized_fn: fn(&mut [u64; WIDTH], &[u64]), + ) { + let mut rng = SmallRng::seed_from_u64(42); + + let internal_constants: Vec = (0..22).map(|_| rng.random()).collect(); + let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); + + // Run both the specialized and generic versions on several random states. 
+ for _ in 0..8 { + let mut state_specialized: [u64; WIDTH] = rng.random(); + let mut state_generic = state_specialized; + + specialized_fn(&mut state_specialized, &internal_constants); + internal_permute_state_asm(&mut state_generic, &diag_raw, &internal_constants); + + for i in 0..WIDTH { + assert_eq!(canon(state_specialized[i]), canon(state_generic[i])); + } + } + } + + #[test] + fn test_specialized_w8_matches_generic() { + test_specialized_matches_generic(MATRIX_DIAG_8_GOLDILOCKS, internal_permute_state_asm_w8); + } + + #[test] + fn test_specialized_w12_matches_generic() { + test_specialized_matches_generic(MATRIX_DIAG_12_GOLDILOCKS, internal_permute_state_asm_w12); + } + + #[test] + fn test_specialized_w16_matches_generic() { + test_specialized_matches_generic(MATRIX_DIAG_16_GOLDILOCKS, internal_permute_state_asm_w16); + } + + #[allow(clippy::type_complexity)] + fn test_dual_matches_single( + diag: [F; WIDTH], + single_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]), + dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64; WIDTH], &[u64]), + ) { + let mut rng = SmallRng::seed_from_u64(77); + + let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); + let constants: Vec = (0..22).map(|_| rng.random()).collect(); + + // Run single-lane on each lane independently. + let mut lane0: [u64; WIDTH] = rng.random(); + let mut lane1: [u64; WIDTH] = rng.random(); + let mut ref0 = lane0; + let mut ref1 = lane1; + + single_fn(&mut ref0, &diag_raw, &constants); + single_fn(&mut ref1, &diag_raw, &constants); + + // Run dual-lane on both at once. Must match. 
+ dual_fn(&mut lane0, &mut lane1, &diag_raw, &constants); + + for i in 0..WIDTH { + assert_eq!(canon(lane0[i]), canon(ref0[i]), "lane0 mismatch at {i}"); + assert_eq!(canon(lane1[i]), canon(ref1[i]), "lane1 mismatch at {i}"); + } + } + + #[test] + fn test_internal_permute_split_dual_w8() { + test_dual_matches_single( + MATRIX_DIAG_8_GOLDILOCKS, + internal_permute_state_asm, + internal_permute_split_dual, + ); + } + + #[test] + fn test_internal_permute_split_dual_w12() { + test_dual_matches_single( + MATRIX_DIAG_12_GOLDILOCKS, + internal_permute_state_asm, + internal_permute_split_dual, + ); + } + + #[test] + fn test_internal_permute_split_dual_w16() { + test_dual_matches_single( + MATRIX_DIAG_16_GOLDILOCKS, + internal_permute_state_asm, + internal_permute_split_dual, + ); + } + + fn test_specialized_dual_matches_generic_dual( + diag: [F; WIDTH], + specialized_dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64]), + ) { + let mut rng = SmallRng::seed_from_u64(99); + + let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); + let constants: Vec = (0..22).map(|_| rng.random()).collect(); + + // The generic dual-lane version is the reference. + let mut lane0: [u64; WIDTH] = rng.random(); + let mut lane1: [u64; WIDTH] = rng.random(); + let mut ref0 = lane0; + let mut ref1 = lane1; + + internal_permute_split_dual(&mut ref0, &mut ref1, &diag_raw, &constants); + + // The specialized version must match. 
+ specialized_dual_fn(&mut lane0, &mut lane1, &constants); + + for i in 0..WIDTH { + assert_eq!(canon(lane0[i]), canon(ref0[i]), "lane0 mismatch at {i}"); + assert_eq!(canon(lane1[i]), canon(ref1[i]), "lane1 mismatch at {i}"); + } + } + + #[test] + fn test_specialized_dual_w8_matches_generic() { + test_specialized_dual_matches_generic_dual( + MATRIX_DIAG_8_GOLDILOCKS, + internal_permute_split_dual_w8, + ); + } + + #[test] + fn test_specialized_dual_w12_matches_generic() { + test_specialized_dual_matches_generic_dual( + MATRIX_DIAG_12_GOLDILOCKS, + internal_permute_split_dual_w12, + ); + } + + #[test] + fn test_specialized_dual_w16_matches_generic() { + test_specialized_dual_matches_generic_dual( + MATRIX_DIAG_16_GOLDILOCKS, + internal_permute_split_dual_w16, + ); + } + + fn make_round_constants(seed: u64, num_rounds: usize) -> Vec<[u64; WIDTH]> { + let mut rng = SmallRng::seed_from_u64(seed); + (0..num_rounds).map(|_| rng.random()).collect() + } + + proptest! { + #[test] + fn test_external_initial_permute_state_asm( + vals in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(42, 4); + + // Reference: apply MDS once, then each external round manually. + let mut expected = vals; + unsafe { mds_light_permutation_asm(&mut expected); } + for rc in &constants { + unsafe { external_round_asm(&mut expected, rc); } + } + + // The composed function should give the same result. + let mut got = vals; + external_initial_permute_state_asm(&mut got, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(got[i]), canon(expected[i])); + } + } + + #[test] + fn test_external_terminal_permute_state_asm( + vals in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(43, 4); + + // Reference: just the external rounds, no initial MDS. 
+ let mut expected = vals; + for rc in &constants { + unsafe { external_round_asm(&mut expected, rc); } + } + + let mut got = vals; + external_terminal_permute_state_asm(&mut got, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(got[i]), canon(expected[i])); + } + } + + #[test] + fn test_external_initial_permute_w8( + vals in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(44, 4); + + // The generic version is the reference. + let mut expected = vals; + external_initial_permute_state_asm(&mut expected, &constants); + + // The W8-specialized version must match. + let mut got = vals; + external_initial_permute_w8(&mut got, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(got[i]), canon(expected[i])); + } + } + + #[test] + fn test_external_terminal_permute_w8( + vals in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(45, 4); + + let mut expected = vals; + external_terminal_permute_state_asm(&mut expected, &constants); + + let mut got = vals; + external_terminal_permute_w8(&mut got, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(got[i]), canon(expected[i])); + } + } + + #[test] + fn test_external_initial_permute_dual( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(46, 4); + + // Run single-lane on each independently as reference. + let mut ref0 = vals0; + let mut ref1 = vals1; + external_initial_permute_state_asm(&mut ref0, &constants); + external_initial_permute_state_asm(&mut ref1, &constants); + + // The dual version processes both at once. 
+ let mut l0 = vals0; + let mut l1 = vals1; + external_initial_permute_dual(&mut l0, &mut l1, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(l0[i]), canon(ref0[i])); + prop_assert_eq!(canon(l1[i]), canon(ref1[i])); + } + } + + #[test] + fn test_external_terminal_permute_dual( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(47, 4); + + let mut ref0 = vals0; + let mut ref1 = vals1; + external_terminal_permute_state_asm(&mut ref0, &constants); + external_terminal_permute_state_asm(&mut ref1, &constants); + + let mut l0 = vals0; + let mut l1 = vals1; + external_terminal_permute_dual(&mut l0, &mut l1, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(l0[i]), canon(ref0[i])); + prop_assert_eq!(canon(l1[i]), canon(ref1[i])); + } + } + + #[test] + fn test_external_initial_permute_dual_w8( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(48, 4); + + // The generic dual version is the reference. + let mut ref0 = vals0; + let mut ref1 = vals1; + external_initial_permute_dual(&mut ref0, &mut ref1, &constants); + + // The W8-specialized dual must match. 
+ let mut l0 = vals0; + let mut l1 = vals1; + external_initial_permute_dual_w8(&mut l0, &mut l1, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(l0[i]), canon(ref0[i])); + prop_assert_eq!(canon(l1[i]), canon(ref1[i])); + } + } + + #[test] + fn test_external_terminal_permute_dual_w8( + vals0 in prop::array::uniform8(any::()), + vals1 in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(49, 4); + + let mut ref0 = vals0; + let mut ref1 = vals1; + external_terminal_permute_dual(&mut ref0, &mut ref1, &constants); + + let mut l0 = vals0; + let mut l1 = vals1; + external_terminal_permute_dual_w8(&mut l0, &mut l1, &constants); + + for i in 0..8 { + prop_assert_eq!(canon(l0[i]), canon(ref0[i])); + prop_assert_eq!(canon(l1[i]), canon(ref1[i])); + } + } + + #[test] + fn test_add_neon(a0: u64, a1: u64, b0: u64, b1: u64) { + unsafe { + // Pack two lanes into NEON vectors, add, then read back. + let (r0, r1) = read_neon(add_neon(make_neon(a0, a1), make_neon(b0, b1))); + + // Each lane must match its scalar add_asm equivalent. 
+ prop_assert_eq!(canon(r0), canon(add_asm(a0, b0))); + prop_assert_eq!(canon(r1), canon(add_asm(a1, b1))); + } + } + + #[test] + fn test_sub_neon(a0: u64, a1: u64, b0: u64, b1: u64) { + unsafe { + let (r0, r1) = read_neon(sub_neon(make_neon(a0, a1), make_neon(b0, b1))); + + prop_assert_eq!(canon(r0), canon(sub_asm(a0, b0))); + prop_assert_eq!(canon(r1), canon(sub_asm(a1, b1))); + } + } + + #[test] + fn test_double_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(double_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(double_asm(a0))); + prop_assert_eq!(canon(r1), canon(double_asm(a1))); + } + } + + #[test] + fn test_div2_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(div2_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(div2_asm(a0))); + prop_assert_eq!(canon(r1), canon(div2_asm(a1))); + } + } + + #[test] + fn test_div4_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(div4_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(div4_asm(a0))); + prop_assert_eq!(canon(r1), canon(div4_asm(a1))); + } + } + + #[test] + fn test_div8_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(div8_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(div8_asm(a0))); + prop_assert_eq!(canon(r1), canon(div8_asm(a1))); + } + } + + #[test] + fn test_div16_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(div16_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(div16_asm(a0))); + prop_assert_eq!(canon(r1), canon(div16_asm(a1))); + } + } + + #[test] + fn test_div32_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(div32_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(div32_asm(a0))); + prop_assert_eq!(canon(r1), canon(div32_asm(a1))); + } + } + + #[test] + fn test_div_2_32_neon(a0: u64, a1: u64) { + unsafe { + let (r0, r1) = read_neon(div_2_32_neon(make_neon(a0, a1))); + + prop_assert_eq!(canon(r0), canon(div_2_32_asm(a0))); + 
prop_assert_eq!(canon(r1), canon(div_2_32_asm(a1))); + } + } + + #[test] + fn test_apply_mat4_neon( + a0: u64, a1: u64, a2: u64, a3: u64, + b0: u64, b1: u64, b2: u64, b3: u64, + ) { + unsafe { + // Scalar reference: run apply_mat4_asm on each lane separately. + let mut lane_a = [a0, a1, a2, a3]; + let mut lane_b = [b0, b1, b2, b3]; + apply_mat4_asm(&mut lane_a); + apply_mat4_asm(&mut lane_b); + + // NEON version: pack both lanes into vectors, apply, read back. + let mut neon_state = [ + make_neon(a0, b0), + make_neon(a1, b1), + make_neon(a2, b2), + make_neon(a3, b3), + ]; + apply_mat4_neon(&mut neon_state); + + for i in 0..4 { + let (r0, r1) = read_neon(neon_state[i]); + prop_assert_eq!(canon(r0), canon(lane_a[i])); + prop_assert_eq!(canon(r1), canon(lane_b[i])); + } + } + } + + #[test] + fn test_mds_light_neon_w8( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + unsafe { + // Run scalar MDS on each lane independently. + let mut ref_a = lane_a; + let mut ref_b = lane_b; + mds_light_permutation_asm(&mut ref_a); + mds_light_permutation_asm(&mut ref_b); + + // Pack both lanes into NEON vectors and run the NEON MDS. + let mut neon_state: [uint64x2_t; 8] = + core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i])); + mds_light_neon(&mut neon_state); + + // Each lane of each vector must match the scalar reference. + for i in 0..8 { + let (r0, r1) = read_neon(neon_state[i]); + prop_assert_eq!(canon(r0), canon(ref_a[i])); + prop_assert_eq!(canon(r1), canon(ref_b[i])); + } + } + } + + #[test] + fn test_sbox_neon( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + unsafe { + // Scalar reference on each lane. + let mut ref_a = lane_a; + let mut ref_b = lane_b; + sbox_layer_asm(&mut ref_a); + sbox_layer_asm(&mut ref_b); + + // NEON version on packed lanes. 
+ let mut neon_state: [uint64x2_t; 8] = + core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i])); + sbox_neon(&mut neon_state); + + for i in 0..8 { + let (r0, r1) = read_neon(neon_state[i]); + prop_assert_eq!(canon(r0), canon(ref_a[i])); + prop_assert_eq!(canon(r1), canon(ref_b[i])); + } + } + } + + #[test] + fn test_external_round_neon( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + rc in prop::array::uniform8(any::()), + ) { + unsafe { + // Scalar reference on each lane. + let mut ref_a = lane_a; + let mut ref_b = lane_b; + external_round_asm(&mut ref_a, &rc); + external_round_asm(&mut ref_b, &rc); + + // NEON version on packed lanes. + let mut neon_state: [uint64x2_t; 8] = + core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i])); + external_round_neon(&mut neon_state, &rc); + + for i in 0..8 { + let (r0, r1) = read_neon(neon_state[i]); + prop_assert_eq!(canon(r0), canon(ref_a[i])); + prop_assert_eq!(canon(r1), canon(ref_b[i])); + } + } + } + + #[test] + fn test_lanes_roundtrip( + lane0 in prop::array::uniform8(any::()), + lane1 in prop::array::uniform8(any::()), + ) { + // Pack two lane arrays into NEON vectors. + let packed = lanes_to_neon(&lane0, &lane1); + + // Unpack back into separate arrays. + let mut out0 = [0u64; 8]; + let mut out1 = [0u64; 8]; + neon_to_lanes(&packed, &mut out0, &mut out1); + + // Must recover the original values. + prop_assert_eq!(out0, lane0); + prop_assert_eq!(out1, lane1); + } + + #[test] + fn test_external_initial_neon( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(50, 4); + + // Scalar reference on each lane. + let mut ref_a = lane_a; + let mut ref_b = lane_b; + external_initial_permute_state_asm(&mut ref_a, &constants); + external_initial_permute_state_asm(&mut ref_b, &constants); + + // NEON version on packed lanes. 
+ let mut neon_state = lanes_to_neon(&lane_a, &lane_b); + external_initial_neon(&mut neon_state, &constants); + + let mut out_a = [0u64; 8]; + let mut out_b = [0u64; 8]; + neon_to_lanes(&neon_state, &mut out_a, &mut out_b); + + for i in 0..8 { + prop_assert_eq!(canon(out_a[i]), canon(ref_a[i])); + prop_assert_eq!(canon(out_b[i]), canon(ref_b[i])); + } + } + + #[test] + fn test_external_terminal_neon( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + let constants = make_round_constants::<8>(51, 4); + + let mut ref_a = lane_a; + let mut ref_b = lane_b; + external_terminal_permute_state_asm(&mut ref_a, &constants); + external_terminal_permute_state_asm(&mut ref_b, &constants); + + let mut neon_state = lanes_to_neon(&lane_a, &lane_b); + external_terminal_neon(&mut neon_state, &constants); + + let mut out_a = [0u64; 8]; + let mut out_b = [0u64; 8]; + neon_to_lanes(&neon_state, &mut out_a, &mut out_b); + + for i in 0..8 { + prop_assert_eq!(canon(out_a[i]), canon(ref_a[i])); + prop_assert_eq!(canon(out_b[i]), canon(ref_b[i])); + } + } + } + + fn test_internal_neon_matches_scalar( + diag: [F; WIDTH], + neon_fn: fn(&mut [uint64x2_t; WIDTH], &[u64]), + scalar_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]), + ) { + let mut rng = SmallRng::seed_from_u64(55); + + let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); + let constants: Vec = (0..22).map(|_| rng.random()).collect(); + + let lane_a: [u64; WIDTH] = rng.random(); + let lane_b: [u64; WIDTH] = rng.random(); + + // Scalar reference on each lane independently. + let mut ref_a = lane_a; + let mut ref_b = lane_b; + scalar_fn(&mut ref_a, &diag_raw, &constants); + scalar_fn(&mut ref_b, &diag_raw, &constants); + + // NEON version packs both lanes and processes them together. 
+ let mut neon_state = lanes_to_neon(&lane_a, &lane_b); + neon_fn(&mut neon_state, &constants); + + let mut out_a = [0u64; WIDTH]; + let mut out_b = [0u64; WIDTH]; + neon_to_lanes(&neon_state, &mut out_a, &mut out_b); + + for i in 0..WIDTH { + assert_eq!(canon(out_a[i]), canon(ref_a[i]), "lane0 mismatch at {i}"); + assert_eq!(canon(out_b[i]), canon(ref_b[i]), "lane1 mismatch at {i}"); + } + } + + #[test] + fn test_internal_permute_neon_w12() { + test_internal_neon_matches_scalar( + MATRIX_DIAG_12_GOLDILOCKS, + internal_permute_neon_w12, + internal_permute_state_asm, + ); + } + + #[test] + fn test_internal_permute_neon_w16() { + test_internal_neon_matches_scalar( + MATRIX_DIAG_16_GOLDILOCKS, + internal_permute_neon_w16, + internal_permute_state_asm, + ); + } + + fn test_internal_neon_generic_matches_scalar(diag: [F; WIDTH]) { + let mut rng = SmallRng::seed_from_u64(66); + + let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); + let constants: Vec = (0..22).map(|_| rng.random()).collect(); + + let lane_a: [u64; WIDTH] = rng.random(); + let lane_b: [u64; WIDTH] = rng.random(); + + // Scalar reference. + let mut ref_a = lane_a; + let mut ref_b = lane_b; + internal_permute_state_asm(&mut ref_a, &diag_raw, &constants); + internal_permute_state_asm(&mut ref_b, &diag_raw, &constants); + + // Generic NEON version. 
+ let mut neon_state = lanes_to_neon(&lane_a, &lane_b); + internal_permute_neon(&mut neon_state, &diag_raw, &constants); + + let mut out_a = [0u64; WIDTH]; + let mut out_b = [0u64; WIDTH]; + neon_to_lanes(&neon_state, &mut out_a, &mut out_b); + + for i in 0..WIDTH { + assert_eq!(canon(out_a[i]), canon(ref_a[i]), "lane0 mismatch at {i}"); + assert_eq!(canon(out_b[i]), canon(ref_b[i]), "lane1 mismatch at {i}"); + } + } + + #[test] + fn test_internal_permute_neon_generic_w8() { + test_internal_neon_generic_matches_scalar(MATRIX_DIAG_8_GOLDILOCKS); + } + + #[test] + fn test_internal_permute_neon_generic_w12() { + test_internal_neon_generic_matches_scalar(MATRIX_DIAG_12_GOLDILOCKS); + } + + #[test] + fn test_internal_permute_neon_generic_w16() { + test_internal_neon_generic_matches_scalar(MATRIX_DIAG_16_GOLDILOCKS); + } + + #[test] + fn test_internal_permute_neon_generic_w20() { + test_internal_neon_generic_matches_scalar(MATRIX_DIAG_20_GOLDILOCKS); + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs new file mode 100644 index 000000000..3d1951a57 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs @@ -0,0 +1,400 @@ +//! Shared utilities for Goldilocks NEON assembly. + +use core::arch::asm; + +use super::packing::PackedGoldilocksNeon; +use crate::{Goldilocks, P}; + +const EPSILON: u64 = P.wrapping_neg(); // 2^32 - 1 + +// --------------------------------------------------------------------------- +// Scalar field arithmetic (inline assembly) +// --------------------------------------------------------------------------- + +/// Multiply two Goldilocks elements using inline assembly. +/// +/// Computes `a * b mod P` where P = 2^64 - 2^32 + 1. The reduction +/// uses the identity `2^64 = 2^32 - 1 (mod P)` (i.e. EPSILON) to fold +/// the 128-bit product back into a single limb. 
+#[inline(always)] +pub(super) unsafe fn mul_asm(a: u64, b: u64) -> u64 { + let _lo: u64; + let _hi: u64; + let _t0: u64; + let _t1: u64; + let _t2: u64; + let result: u64; + + unsafe { + asm!( + // Compute 128-bit product: hi:lo = a * b + "mul {lo}, {a}, {b}", + "umulh {hi}, {a}, {b}", + + // Reduce: result = lo - hi_hi + hi_lo * EPSILON + // where hi = hi_hi * 2^32 + hi_lo + + // t0 = lo - (hi >> 32), with borrow detection + "lsr {t0}, {hi}, #32", // t0 = hi >> 32 + "subs {t1}, {lo}, {t0}", // t1 = lo - t0, set flags + "csetm {t2:w}, cc", // t2 = -1 if borrow, 0 otherwise + "sub {t1}, {t1}, {t2}", // Adjust for borrow (subtract EPSILON) + + // t0 = (hi & EPSILON) * EPSILON + "and {t0}, {hi}, {epsilon}", // t0 = hi & EPSILON + "mul {t0}, {t0}, {epsilon}", // t0 = t0 * EPSILON + + // result = t1 + t0, with overflow detection + "adds {result}, {t1}, {t0}", // result = t1 + t0, set flags + "csetm {t2:w}, cs", // t2 = -1 if carry, 0 otherwise + "add {result}, {result}, {t2}", // Add EPSILON on overflow + + a = in(reg) a, + b = in(reg) b, + epsilon = in(reg) EPSILON, + lo = out(reg) _lo, + hi = out(reg) _hi, + t0 = out(reg) _t0, + t1 = out(reg) _t1, + t2 = out(reg) _t2, + result = out(reg) result, + options(pure, nomem, nostack), + ); + } + + result +} + +/// Compute `a * b + c` in the Goldilocks field using inline assembly. +/// +/// Fused multiply-add: forms the 128-bit product `a * b`, adds `c` into +/// the low limb (with carry propagation), then reduces modulo P. 
+#[inline(always)] +pub(super) unsafe fn mul_add_asm(a: u64, b: u64, c: u64) -> u64 { + let _lo: u64; + let _hi: u64; + let _t0: u64; + let _t1: u64; + let _t2: u64; + let result: u64; + + unsafe { + asm!( + // Compute 128-bit product: hi:lo = a * b + "mul {lo}, {a}, {b}", + "umulh {hi}, {a}, {b}", + + // Accumulate c into the 128-bit product: hi:lo = hi:lo + c + "adds {lo}, {lo}, {c}", + "adc {hi}, {hi}, xzr", + + // Reduce: result = lo - hi_hi + hi_lo * EPSILON + // where hi = hi_hi * 2^32 + hi_lo + + // t0 = lo - (hi >> 32), with borrow detection + "lsr {t0}, {hi}, #32", // t0 = hi >> 32 + "subs {t1}, {lo}, {t0}", // t1 = lo - t0, set flags + "csetm {t2:w}, cc", // t2 = -1 if borrow, 0 otherwise + "sub {t1}, {t1}, {t2}", // Adjust for borrow (subtract EPSILON) + + // t0 = (hi & EPSILON) * EPSILON + "and {t0}, {hi}, {epsilon}", // t0 = hi & EPSILON + "mul {t0}, {t0}, {epsilon}", // t0 = t0 * EPSILON + + // result = t1 + t0, with overflow detection + "adds {result}, {t1}, {t0}", // result = t1 + t0, set flags + "csetm {t2:w}, cs", // t2 = -1 if carry, 0 otherwise + "add {result}, {result}, {t2}", // Add EPSILON on overflow + + a = in(reg) a, + b = in(reg) b, + c = in(reg) c, + epsilon = in(reg) EPSILON, + lo = out(reg) _lo, + hi = out(reg) _hi, + t0 = out(reg) _t0, + t1 = out(reg) _t1, + t2 = out(reg) _t2, + result = out(reg) result, + options(pure, nomem, nostack), + ); + } + + result +} + +/// Add two Goldilocks elements with overflow handling using inline assembly. +/// +/// Computes `a + b mod P`. On overflow (carry out of 64 bits), subtracts +/// P by adding EPSILON (which equals -P mod 2^64, i.e. 2^32 - 1). 
+#[inline(always)] +pub(super) unsafe fn add_asm(a: u64, b: u64) -> u64 { + let result: u64; + let _adj: u64; + + unsafe { + asm!( + "adds {result}, {a}, {b}", + "csetm {adj:w}, cs", + "add {result}, {result}, {adj}", + a = in(reg) a, + b = in(reg) b, + result = out(reg) result, + adj = out(reg) _adj, + options(pure, nomem, nostack), + ); + } + + result +} + +// --------------------------------------------------------------------------- +// Lane conversion (packed NEON <-> raw u64 arrays) +// --------------------------------------------------------------------------- + +/// Unpack a packed NEON state into two raw `u64` lane arrays. +/// +/// Each packed slot contains two Goldilocks elements (lane 0, lane 1). +/// This function extracts the internal `u64` representation of each +/// element into two separate arrays, one per lane. +/// +/// # Layout +/// +/// ```text +/// packed[i] = (field_elem_a, field_elem_b) +/// +/// lane0[i] = field_elem_a.value (raw u64) +/// lane1[i] = field_elem_b.value (raw u64) +/// ``` +#[inline] +pub(super) fn unpack_lanes( + state: &[PackedGoldilocksNeon; WIDTH], +) -> ([u64; WIDTH], [u64; WIDTH]) { + // Extract the raw u64 representation from each packed slot. + let lane0: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[0].value); + let lane1: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[1].value); + (lane0, lane1) +} + +/// Pack two raw `u64` lane arrays back into a packed NEON state. +/// +/// Each raw value is wrapped into a Goldilocks field element (with +/// reduction modulo P) and paired into a packed slot. +/// +/// # Layout +/// +/// ```text +/// lane0[i], lane1[i] -> packed[i] = (Goldilocks(lane0[i]), Goldilocks(lane1[i])) +/// ``` +#[inline] +pub(super) fn pack_lanes( + state: &mut [PackedGoldilocksNeon; WIDTH], + lane0: &[u64; WIDTH], + lane1: &[u64; WIDTH], +) { + for i in 0..WIDTH { + // Wrap each raw u64 into a field element and pair them. 
+ state[i] = PackedGoldilocksNeon([Goldilocks::new(lane0[i]), Goldilocks::new(lane1[i])]); + } +} + +#[cfg(test)] +mod tests { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + use proptest::prelude::*; + + use super::*; + + type F = Goldilocks; + + /// Reduce a raw `u64` to its canonical Goldilocks representative. + fn canon(x: u64) -> u64 { + F::new(x).as_canonical_u64() + } + + proptest! { + // ---------------------------------------------------------------- + // Scalar field arithmetic + // ---------------------------------------------------------------- + + /// Verify ASM addition against field addition. + #[test] + fn test_add_asm(a: u64, b: u64) { + let expected = (F::new(a) + F::new(b)).as_canonical_u64(); + let got = canon(unsafe { add_asm(a, b) }); + prop_assert_eq!(got, expected); + } + + /// Verify ASM multiplication against field multiplication. + #[test] + fn test_mul_asm(a: u64, b: u64) { + let expected = (F::new(a) * F::new(b)).as_canonical_u64(); + let got = canon(unsafe { mul_asm(a, b) }); + prop_assert_eq!(got, expected); + } + + /// Verify ASM fused multiply-add against field multiply-add. + #[test] + fn test_mul_add_asm(a: u64, b: u64, c: u64) { + let expected = (F::new(a) * F::new(b) + F::new(c)).as_canonical_u64(); + let got = canon(unsafe { mul_add_asm(a, b, c) }); + prop_assert_eq!(got, expected); + } + + // ---------------------------------------------------------------- + // Unpack: packed state -> two raw u64 lane arrays + // ---------------------------------------------------------------- + + #[test] + fn test_unpack_lanes_w8( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + // Build a packed state from two independent lane arrays. + let packed: [PackedGoldilocksNeon; 8] = + core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); + + // Unpack into raw u64 lane arrays. 
+ let (got0, got1) = unpack_lanes(&packed); + + // Each raw value must be the internal representation of the field element. + for i in 0..8 { + prop_assert_eq!(got0[i], F::new(lane_a[i]).value); + prop_assert_eq!(got1[i], F::new(lane_b[i]).value); + } + } + + #[test] + fn test_unpack_lanes_w12( + lane_a in prop::array::uniform12(any::()), + lane_b in prop::array::uniform12(any::()), + ) { + // Same verification, width 12. + let packed: [PackedGoldilocksNeon; 12] = + core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); + + let (got0, got1) = unpack_lanes(&packed); + + for i in 0..12 { + prop_assert_eq!(got0[i], F::new(lane_a[i]).value); + prop_assert_eq!(got1[i], F::new(lane_b[i]).value); + } + } + + // ---------------------------------------------------------------- + // Pack: two raw u64 lane arrays -> packed state + // ---------------------------------------------------------------- + + #[test] + fn test_pack_lanes_w8( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + // Pack two raw lane arrays into packed state. + let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8]; + pack_lanes(&mut packed, &lane_a, &lane_b); + + // Each packed element must hold the two corresponding field elements. + for i in 0..8 { + prop_assert_eq!(packed[i].0[0], F::new(lane_a[i])); + prop_assert_eq!(packed[i].0[1], F::new(lane_b[i])); + } + } + + #[test] + fn test_pack_lanes_w12( + lane_a in prop::array::uniform12(any::()), + lane_b in prop::array::uniform12(any::()), + ) { + // Same verification, width 12. 
+ let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12]; + pack_lanes(&mut packed, &lane_a, &lane_b); + + for i in 0..12 { + prop_assert_eq!(packed[i].0[0], F::new(lane_a[i])); + prop_assert_eq!(packed[i].0[1], F::new(lane_b[i])); + } + } + + // ---------------------------------------------------------------- + // Roundtrip: pack then unpack recovers canonical values + // ---------------------------------------------------------------- + + #[test] + fn test_roundtrip_pack_unpack_w8( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + // Pack two lane arrays, then unpack them. + let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8]; + pack_lanes(&mut packed, &lane_a, &lane_b); + let (out0, out1) = unpack_lanes(&packed); + + // The canonical form of the recovered values must match. + for i in 0..8 { + prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64()); + prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64()); + } + } + + #[test] + fn test_roundtrip_pack_unpack_w12( + lane_a in prop::array::uniform12(any::()), + lane_b in prop::array::uniform12(any::()), + ) { + // Same roundtrip, width 12. + let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12]; + pack_lanes(&mut packed, &lane_a, &lane_b); + let (out0, out1) = unpack_lanes(&packed); + + for i in 0..12 { + prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64()); + prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64()); + } + } + + // ---------------------------------------------------------------- + // Roundtrip: unpack then pack preserves packed state + // ---------------------------------------------------------------- + + #[test] + fn test_roundtrip_unpack_pack_w8( + lane_a in prop::array::uniform8(any::()), + lane_b in prop::array::uniform8(any::()), + ) { + // Start from a packed state. 
+ let original: [PackedGoldilocksNeon; 8] = + core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); + + // Unpack into raw lanes, then pack back. + let (raw0, raw1) = unpack_lanes(&original); + let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 8]; + pack_lanes(&mut restored, &raw0, &raw1); + + // The restored packed state must equal the original. + for i in 0..8 { + prop_assert_eq!(restored[i].0[0], original[i].0[0]); + prop_assert_eq!(restored[i].0[1], original[i].0[1]); + } + } + + #[test] + fn test_roundtrip_unpack_pack_w12( + lane_a in prop::array::uniform12(any::()), + lane_b in prop::array::uniform12(any::()), + ) { + // Same reverse roundtrip, width 12. + let original: [PackedGoldilocksNeon; 12] = + core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); + + let (raw0, raw1) = unpack_lanes(&original); + let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 12]; + pack_lanes(&mut restored, &raw0, &raw1); + + for i in 0..12 { + prop_assert_eq!(restored[i].0[0], original[i].0[0]); + prop_assert_eq!(restored[i].0[1], original[i].0[1]); + } + } + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs new file mode 100644 index 000000000..5ac38a28b --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs @@ -0,0 +1,217 @@ +use p3_field::extension::{ + BinomiallyExtendable, BinomiallyExtendableAlgebra, HasTwoAdicBinomialExtension, +}; +use p3_field::{PrimeCharacteristicRing, TwoAdicField, field_to_array}; + +use crate::Goldilocks; + +impl BinomiallyExtendableAlgebra for Goldilocks {} + +impl BinomiallyExtendable<2> for Goldilocks { + // Verifiable in Sage with + // `R. = GF(p)[]; assert (x^2 - 7).is_irreducible()`. + const W: Self = Self::new(7); + + // DTH_ROOT = W^((p - 1)/2). 
+ const DTH_ROOT: Self = Self::new(18446744069414584320); + + const EXT_GENERATOR: [Self; 2] = [ + Self::new(18081566051660590251), + Self::new(16121475356294670766), + ]; +} + +impl HasTwoAdicBinomialExtension<2> for Goldilocks { + const EXT_TWO_ADICITY: usize = 33; + + fn ext_two_adic_generator(bits: usize) -> [Self; 2] { + assert!(bits <= 33); + + if bits == 33 { + [Self::ZERO, Self::new(15659105665374529263)] + } else { + [Self::two_adic_generator(bits), Self::ZERO] + } + } +} + +impl BinomiallyExtendableAlgebra for Goldilocks {} + +impl BinomiallyExtendable<3> for Goldilocks { + // Verifiable in Sage with + // `R. = GF(p)[]; assert (x^3 - 2).is_irreducible()`. + // Same irreducible as Lambda's Degree3GoldilocksExtensionField. + const W: Self = Self::new(2); + + // DTH_ROOT = primitive 3rd root of unity = 7^((p-1)/3) mod p. + const DTH_ROOT: Self = Self::new(18446744065119617025); + + // Generator of GF(p^3)* = 5 + w. Verified: passes order checks for + // all small prime factors of p^3 - 1. + const EXT_GENERATOR: [Self; 3] = [Self::new(5), Self::ONE, Self::ZERO]; +} + +impl HasTwoAdicBinomialExtension<3> for Goldilocks { + // v_2(p^3 - 1) = v_2(p-1) + v_2(p^2+p+1) = 32 + 0 = 32. + const EXT_TWO_ADICITY: usize = 32; + + fn ext_two_adic_generator(bits: usize) -> [Self; 3] { + assert!(bits <= 32); + field_to_array(Self::two_adic_generator(bits)) + } +} + +impl BinomiallyExtendableAlgebra for Goldilocks {} + +impl BinomiallyExtendable<5> for Goldilocks { + // Verifiable via: + // ```sage + // # Define Fp + // p = 2**64 - 2**32 + 1 + // F = GF(p) + + // # Define Fp[z] + // R. 
= PolynomialRing(F) + + // # The polynomial x^5-3 is irreducible + // assert(R(z^5-3).is_irreducible()) + // ``` + const W: Self = Self::new(3); + + // 5-th root = w^((p - 1)/5) + const DTH_ROOT: Self = Self::new(1041288259238279555); + + // Generator of the extension field + // Obtained by finding the smallest Hamming weight vector + // with appropriate order, starting at [0,1,0,0,0] + const EXT_GENERATOR: [Self; 5] = [Self::TWO, Self::ONE, Self::ZERO, Self::ZERO, Self::ZERO]; +} + +impl HasTwoAdicBinomialExtension<5> for Goldilocks { + const EXT_TWO_ADICITY: usize = 32; + + fn ext_two_adic_generator(bits: usize) -> [Self; 5] { + assert!(bits <= 32); + + field_to_array(Self::two_adic_generator(bits)) + } +} + +#[cfg(test)] +mod test_quadratic_extension { + + use num_bigint::BigUint; + use p3_field::extension::BinomialExtensionField; + use p3_field::{ExtensionField, PrimeCharacteristicRing}; + use p3_field_testing::{ + test_extension_field, test_field, test_packed_extension_field, + test_two_adic_extension_field, + }; + + use crate::Goldilocks; + + type F = Goldilocks; + type EF = BinomialExtensionField; + + // There is a redundant representation of zero but we already tested it + // when testing the base field. + const ZEROS: [EF; 1] = [EF::ZERO]; + const ONES: [EF; 1] = [EF::ONE]; + + // Get the prime factorization of the order of the multiplicative group. + // i.e. the prime factorization of P^2 - 1. 
+ fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 9] { + [ + (BigUint::from(2u8), 33), + (BigUint::from(3u8), 1), + (BigUint::from(5u8), 1), + (BigUint::from(7u8), 1), + (BigUint::from(17u8), 1), + (BigUint::from(179u8), 1), + (BigUint::from(257u16), 1), + (BigUint::from(65537u32), 1), + (BigUint::from(7361031152998637u64), 1), + ] + } + + test_field!( + super::EF, + &super::ZEROS, + &super::ONES, + &super::multiplicative_group_prime_factorization() + ); + + test_extension_field!(super::F, super::EF); + test_two_adic_extension_field!(super::F, super::EF); + + type Pef = >::ExtensionPacking; + const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO]; + const PACKED_ONES: [Pef; 1] = [Pef::ONE]; + test_packed_extension_field!( + super::F, + super::EF, + super::Pef, + &super::PACKED_ZEROS, + &super::PACKED_ONES + ); +} + +#[cfg(test)] +mod test_quintic_extension { + + use num_bigint::BigUint; + use p3_field::extension::BinomialExtensionField; + use p3_field::{ExtensionField, PrimeCharacteristicRing}; + use p3_field_testing::{ + test_extension_field, test_field, test_packed_extension_field, + test_two_adic_extension_field, + }; + + use crate::Goldilocks; + + type F = Goldilocks; + type EF = BinomialExtensionField; + + // There is a redundant representation of zero but we already tested it + // when testing the base field. + const ZEROS: [EF; 1] = [EF::ZERO]; + const ONES: [EF; 1] = [EF::ONE]; + + // Get the prime factorization of the order of the multiplicative group. + // i.e. the prime factorization of P^5 - 1. 
+ fn multiplicative_group_prime_factorization() -> [(num_bigint::BigUint, u32); 10] { + [ + (BigUint::from(2u8), 32), + (BigUint::from(3u8), 1), + (BigUint::from(5u8), 2), + (BigUint::from(17u8), 1), + (BigUint::from(257u16), 1), + (BigUint::from(45971u16), 1), + (BigUint::from(65537u32), 1), + (BigUint::from(255006435240067831u64), 1), + (BigUint::from(280083648770327405561u128), 1), + (BigUint::from(7053197395277272939628824863222181u128), 1), + ] + } + + test_field!( + super::EF, + &super::ZEROS, + &super::ONES, + &super::multiplicative_group_prime_factorization() + ); + + test_extension_field!(super::F, super::EF); + test_two_adic_extension_field!(super::F, super::EF); + + type Pef = >::ExtensionPacking; + const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO]; + const PACKED_ONES: [Pef; 1] = [Pef::ONE]; + test_packed_extension_field!( + super::F, + super::EF, + super::Pef, + &super::PACKED_ZEROS, + &super::PACKED_ONES + ); +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs new file mode 100644 index 000000000..ebe3f8c7a --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs @@ -0,0 +1,813 @@ +use alloc::vec; +use alloc::vec::Vec; +use core::fmt::{Debug, Display, Formatter}; +use core::hash::{Hash, Hasher}; +use core::iter::{Product, Sum}; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; +use core::{array, fmt}; + +use num_bigint::BigUint; +use p3_challenger::UniformSamplingField; +use p3_field::exponentiation::exp_10540996611094048183; +use p3_field::integers::QuotientMap; +use p3_field::op_assign_macros::{ + impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign, +}; +use p3_field::{ + Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField, + PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64, + quotient_map_large_iint, 
quotient_map_large_uint, quotient_map_small_int, +}; +use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner}; +use rand::Rng; +use rand::distr::{Distribution, StandardUniform}; +use serde::{Deserialize, Serialize}; + +/// The Goldilocks prime +pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001; + +/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`. +/// +/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64. +#[derive(Copy, Clone, Default, Serialize, Deserialize)] +#[repr(transparent)] // Important for reasoning about memory layout +#[must_use] +pub struct Goldilocks { + /// Not necessarily canonical. + pub(crate) value: u64, +} + +impl Goldilocks { + /// Create a new field element from any `u64`. + /// + /// Any `u64` value is accepted. No reduction is performed since + /// Goldilocks uses a non-canonical internal representation. + #[inline] + pub const fn new(value: u64) -> Self { + Self { value } + } + + /// Convert a `[u64; N]` array to an array of field elements. + /// + /// Const version of `input.map(Goldilocks::new)`. + #[inline] + pub const fn new_array(input: [u64; N]) -> [Self; N] { + let mut output = [Self::ZERO; N]; + let mut i = 0; + while i < N { + output[i].value = input[i]; + i += 1; + } + output + } + + /// Convert a `[[u64; N]; M]` array to a 2D array of field elements. + /// + /// Const version of `input.map(Goldilocks::new_array)`. + #[inline] + pub const fn new_2d_array( + input: [[u64; N]; M], + ) -> [[Self; N]; M] { + let mut output = [[Self::ZERO; N]; M]; + let mut i = 0; + while i < M { + output[i] = Self::new_array(input[i]); + i += 1; + } + output + } + + /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`. + const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg(); + + /// A list of generators for the two-adic subgroups of the goldilocks field. 
+ /// + /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`. + pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([ + 0x0000000000000001, + 0xffffffff00000000, + 0x0001000000000000, + 0xfffffffeff000001, + 0xefffffff00000001, + 0x00003fffffffc000, + 0x0000008000000000, + 0xf80007ff08000001, + 0xbf79143ce60ca966, + 0x1905d02a5c411f4e, + 0x9d8f2ad78bfed972, + 0x0653b4801da1c8cf, + 0xf2c35199959dfcb6, + 0x1544ef2335d17997, + 0xe0ee099310bba1e2, + 0xf6b2cffe2306baac, + 0x54df9630bf79450e, + 0xabd0a6e8aa3d8a0e, + 0x81281a7b05f9beac, + 0xfbd41c6b8caa3302, + 0x30ba2ecd5e93e76d, + 0xf502aef532322654, + 0x4b2a18ade67246b5, + 0xea9d5a1336fbc98b, + 0x86cdcc31c307e171, + 0x4bbaf5976ecfefd8, + 0xed41d05b78d6e286, + 0x10d78dd8915a171d, + 0x59049500004a4485, + 0xdfa8c93ba46d2666, + 0x7e9bd009b86a0845, + 0x400a7f755588e659, + 0x185629dcda58878c, + ]); + + /// A list of powers of two from 0 to 95. + /// + /// Note that 2^{96} = -1 mod P so all powers of two can be simply + /// derived from this list. 
+ const POWERS_OF_TWO: [Self; 96] = { + let mut powers_of_two = [Self::ONE; 96]; + + let mut i = 1; + while i < 64 { + powers_of_two[i] = Self::new(1 << i); + i += 1; + } + let mut var = Self::new(1 << 63); + while i < 96 { + var = const_add(var, var); + powers_of_two[i] = var; + i += 1; + } + powers_of_two + }; +} + +impl PartialEq for Goldilocks { + fn eq(&self, other: &Self) -> bool { + self.as_canonical_u64() == other.as_canonical_u64() + } +} + +impl Eq for Goldilocks {} + +impl Packable for Goldilocks {} + +impl Hash for Goldilocks { + fn hash(&self, state: &mut H) { + state.write_u64(self.as_canonical_u64()); + } +} + +impl Ord for Goldilocks { + fn cmp(&self, other: &Self) -> core::cmp::Ordering { + self.as_canonical_u64().cmp(&other.as_canonical_u64()) + } +} + +impl PartialOrd for Goldilocks { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Display for Goldilocks { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.as_canonical_u64(), f) + } +} + +impl Debug for Goldilocks { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Debug::fmt(&self.as_canonical_u64(), f) + } +} + +impl Distribution for StandardUniform { + fn sample(&self, rng: &mut R) -> Goldilocks { + loop { + let next_u64 = rng.next_u64(); + let is_canonical = next_u64 < Goldilocks::ORDER_U64; + if is_canonical { + return Goldilocks::new(next_u64); + } + } + } +} + +impl UniformSamplingField for Goldilocks { + const MAX_SINGLE_SAMPLE_BITS: usize = 24; + const SAMPLING_BITS_M: [u64; 64] = { + let prime: u64 = P; + let mut a = [0u64; 64]; + let mut k = 0; + while k < 64 { + if k == 0 { + a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always. 
+ } else { + // Create a mask to zero out the last k bits + let mask = !((1u64 << k) - 1); + a[k] = prime & mask; + } + k += 1; + } + a + }; +} + +impl PrimeCharacteristicRing for Goldilocks { + type PrimeSubfield = Self; + + const ZERO: Self = Self::new(0); + const ONE: Self = Self::new(1); + const TWO: Self = Self::new(2); + const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1); + + #[inline] + fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { + f + } + + #[inline] + fn from_bool(b: bool) -> Self { + Self::new(b.into()) + } + + #[inline] + fn halve(&self) -> Self { + Self::new(halve_u64::

(self.value)) + } + + #[inline] + fn mul_2exp_u64(&self, exp: u64) -> Self { + // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P. + if exp < 96 { + *self * Self::POWERS_OF_TWO[exp as usize] + } else if exp < 192 { + -*self * Self::POWERS_OF_TWO[(exp - 96) as usize] + } else { + self.mul_2exp_u64(exp % 192) + } + } + + #[inline] + fn div_2exp_u64(&self, mut exp: u64) -> Self { + // In the goldilocks field, 2^192 = 1 mod P. + // Thus 2^{-n} = 2^{192 - n} mod P. + exp %= 192; + self.mul_2exp_u64(192 - exp) + } + + #[inline] + fn sum_array(input: &[Self]) -> Self { + assert_eq!(N, input.len()); + // Benchmarking shows that for N <= 3 it's faster to sum the elements directly + // but for N > 3 it's faster to use the .sum() methods which passes through u128's + // allowing for delayed reductions. + match N { + 0 => Self::ZERO, + 1 => input[0], + 2 => input[0] + input[1], + 3 => input[0] + input[1] + input[2], + _ => input.iter().copied().sum(), + } + } + + #[inline] + fn dot_product(lhs: &[Self; N], rhs: &[Self; N]) -> Self { + // The constant OFFSET has 2 important properties: + // 1. It is a multiple of P. + // 2. It is greater than the maximum possible value of the sum of the products of two u64s. + const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32); + assert!((N as u32) <= (1 << 31)); + match N { + 0 => Self::ZERO, + 1 => lhs[0] * rhs[0], + 2 => { + // We unroll the N = 2 case as it is slightly faster and this is an important case + // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension. + let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128); + let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128); + + // We know that long_prod_0, long_prod_1 < OFFSET. + // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET. + let (sum, over) = long_prod_0.overflowing_add(long_prod_1); + // Compiler really likes defining sum_corr here instead of in the if/else. 
+ let sum_corr = sum.wrapping_sub(OFFSET); + if over { + reduce128(sum_corr) + } else { + reduce128(sum) + } + } + _ => { + let (lo_plus_hi, hi) = lhs + .iter() + .zip(rhs) + .map(|(x, y)| (x.value as u128) * (y.value as u128)) + .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| { + // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits. + let val_hi = (val >> 96) as u64; + // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi. + // As N <= 2^32, acc_hi cannot overflow. + unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) } + }); + // First, remove the hi part from lo_plus_hi. + let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96); + // As 2^{96} = -1 mod P, we simply need to reduce lo - hi. + // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not overflow or underflow. + let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) }; + reduce128(sum) + } + } + } + + #[inline] + fn zero_vec(len: usize) -> Vec { + // SAFETY: + // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment + // and memory layout making `flatten_to_base` safe. This will create + // a vector of Goldilocks elements with value set to 0. + unsafe { flatten_to_base(vec![0u64; len]) } + } +} + +/// Degree of the smallest permutation polynomial for Goldilocks. +/// +/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7. +impl InjectiveMonomial<7> for Goldilocks {} + +impl PermutationMonomial<7> for Goldilocks { + /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}. + /// + /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2^32) + 1 = 1 mod (p - 1)`. 
+ fn injective_exp_root_n(&self) -> Self { + exp_10540996611094048183(*self) + } +} + +impl RawDataSerializable for Goldilocks { + impl_raw_serializable_primefield64!(); +} + +impl Field for Goldilocks { + #[cfg(all( + target_arch = "x86_64", + target_feature = "avx2", + not(target_feature = "avx512f") + ))] + type Packing = crate::PackedGoldilocksAVX2; + + #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] + type Packing = crate::PackedGoldilocksAVX512; + + // PATCHED for bench_vs_plonky3: disable NEON packing for apples-to-apples + // scalar comparison against Lambda STARK. Upstream: `crate::PackedGoldilocksNeon`. + #[cfg(target_arch = "aarch64")] + type Packing = Self; + + #[cfg(not(any( + all( + target_arch = "x86_64", + target_feature = "avx2", + not(target_feature = "avx512f") + ), + all(target_arch = "x86_64", target_feature = "avx512f"), + target_arch = "aarch64", + )))] + type Packing = Self; + + // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator() + const GENERATOR: Self = Self::new(7); + + fn is_zero(&self) -> bool { + self.value == 0 || self.value == Self::ORDER_U64 + } + + fn try_inverse(&self) -> Option { + if self.is_zero() { + return None; + } + + Some(gcd_inversion(*self)) + } + + #[inline] + fn order() -> BigUint { + P.into() + } +} + +// We use macros to implement QuotientMap for all integer types except for u64 and i64. +quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]); +quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]); +quotient_map_large_uint!( + Goldilocks, + u64, + Goldilocks::ORDER_U64, + "`[0, 2^64 - 2^32]`", + "`[0, 2^64 - 1]`", + [u128] +); +quotient_map_large_iint!( + Goldilocks, + i64, + "`[-(2^63 - 2^31), 2^63 - 2^31]`", + "`[1 + 2^32 - 2^64, 2^64 - 1]`", + [(i128, u128)] +); + +impl QuotientMap for Goldilocks { + /// Convert a given `u64` integer into an element of the `Goldilocks` field. + /// + /// No reduction is needed as the internal value is allowed + /// to be any u64. 
+ #[inline] + fn from_int(int: u64) -> Self { + Self::new(int) + } + + /// Convert a given `u64` integer into an element of the `Goldilocks` field. + /// + /// Return `None` if the given integer is greater than or equal to `p = 2^64 - 2^32 + 1`. + #[inline] + fn from_canonical_checked(int: u64) -> Option { + (int < Self::ORDER_U64).then(|| Self::new(int)) + } + + /// Convert a given `u64` integer into an element of the `Goldilocks` field. + /// + /// # Safety + /// In this case this function is actually always safe as the internal + /// value is allowed to be any u64. + #[inline(always)] + unsafe fn from_canonical_unchecked(int: u64) -> Self { + Self::new(int) + } +} + +impl QuotientMap for Goldilocks { + /// Convert a given `i64` integer into an element of the `Goldilocks` field. + /// + /// We simply need to deal with the sign. + #[inline] + fn from_int(int: i64) -> Self { + if int >= 0 { + Self::new(int as u64) + } else { + Self::new(Self::ORDER_U64.wrapping_add_signed(int)) + } + } + + /// Convert a given `i64` integer into an element of the `Goldilocks` field. + /// + /// Returns `None` if the input does not lie in the range `[-(2^63 - 2^31), 2^63 - 2^31]`. + #[inline] + fn from_canonical_checked(int: i64) -> Option { + const POS_BOUND: i64 = (P >> 1) as i64; + const NEG_BOUND: i64 = -POS_BOUND; + match int { + 0..=POS_BOUND => Some(Self::new(int as u64)), + NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))), + _ => None, + } + } + + /// Convert a given `i64` integer into an element of the `Goldilocks` field. + /// + /// # Safety + /// In this case this function is actually always safe as the internal + /// value is allowed to be any u64. 
+ #[inline(always)] + unsafe fn from_canonical_unchecked(int: i64) -> Self { + Self::from_int(int) + } +} + +impl PrimeField for Goldilocks { + fn as_canonical_biguint(&self) -> BigUint { + self.as_canonical_u64().into() + } +} + +impl PrimeField64 for Goldilocks { + const ORDER_U64: u64 = P; + + #[inline] + fn as_canonical_u64(&self) -> u64 { + let mut c = self.value; + // We only need one conditional subtraction, since 2 * ORDER would not fit in a u64. + if c >= Self::ORDER_U64 { + c -= Self::ORDER_U64; + } + c + } +} + +impl TwoAdicField for Goldilocks { + const TWO_ADICITY: usize = 32; + + fn two_adic_generator(bits: usize) -> Self { + assert!(bits <= Self::TWO_ADICITY); + Self::TWO_ADIC_GENERATORS[bits] + } +} + +/// A const version of the addition function. +/// +/// Useful for constructing constant values in const contexts. Outside of +/// const contexts, Add should be used instead. +#[inline] +const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks { + let (sum, over) = lhs.value.overflowing_add(rhs.value); + let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER); + if over { + sum += Goldilocks::NEG_ORDER; + } + Goldilocks::new(sum) +} + +impl Add for Goldilocks { + type Output = Self; + + #[inline] + fn add(self, rhs: Self) -> Self { + let (sum, over) = self.value.overflowing_add(rhs.value); + let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER); + if over { + // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not + // sufficient for double-overflow. + // This assume does two things: + // 1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip + // this check. + // 2. Hints to the compiler how rare this double-overflow is (thus handled better with + // a branch). + unsafe { + assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64); + } + branch_hint(); + sum += Self::NEG_ORDER; // Cannot overflow. 
+ } + Self::new(sum) + } +} + +impl Sub for Goldilocks { + type Output = Self; + + #[inline] + fn sub(self, rhs: Self) -> Self { + let (diff, under) = self.value.overflowing_sub(rhs.value); + let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER); + if under { + // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not + // sufficient for double-underflow. + // This assume does two things: + // 1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER, + // then it can skip this check. + // 2. Hints to the compiler how rare this double-underflow is (thus handled better + // with a branch). + unsafe { + assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64); + } + branch_hint(); + diff -= Self::NEG_ORDER; // Cannot underflow. + } + Self::new(diff) + } +} + +impl Neg for Goldilocks { + type Output = Self; + + #[inline] + fn neg(self) -> Self::Output { + Self::new(Self::ORDER_U64 - self.as_canonical_u64()) + } +} + +impl Mul for Goldilocks { + type Output = Self; + + #[inline] + fn mul(self, rhs: Self) -> Self { + reduce128(u128::from(self.value) * u128::from(rhs.value)) + } +} + +impl_add_assign!(Goldilocks); +impl_sub_assign!(Goldilocks); +impl_mul_methods!(Goldilocks); +impl_div_methods!(Goldilocks, Goldilocks); + +impl Sum for Goldilocks { + fn sum>(iter: I) -> Self { + // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2. + + // This sum will not overflow so long as iter.len() < 2^64. + let sum = iter.map(|x| x.value as u128).sum::(); + reduce128(sum) + } +} + +/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the +/// field order and `2^64`. 
+#[inline] +pub(crate) fn reduce128(x: u128) -> Goldilocks { + let (x_lo, x_hi) = split(x); // This is a no-op + let x_hi_hi = x_hi >> 32; + let x_hi_lo = x_hi & Goldilocks::NEG_ORDER; + + let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi); + if borrow { + branch_hint(); // A borrow is exceedingly rare. It is faster to branch. + t0 -= Goldilocks::NEG_ORDER; // Cannot underflow. + } + let t1 = x_hi_lo * Goldilocks::NEG_ORDER; + let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) }; + Goldilocks::new(t2) +} + +#[inline] +#[allow(clippy::cast_possible_truncation)] +const fn split(x: u128) -> (u64, u64) { + (x as u64, (x >> 64) as u64) +} + +/// Fast addition modulo ORDER for x86-64. +/// This function is marked unsafe for the following reasons: +/// - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001. +/// - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in +/// the registers, so its use is not recommended when either input will be used again. +#[inline(always)] +#[cfg(target_arch = "x86_64")] +unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { + unsafe { + let res_wrapped: u64; + let adjustment: u64; + core::arch::asm!( + "add {0}, {1}", + // Trick. The carry flag is set iff the addition overflowed. + // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does + // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of + // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1} + // on overflow; this happens to be NEG_ORDER. + // Note that the CPU does not realize that the result of sbb x, x does not actually depend + // on x. We must write the result to a register that we know to be ready. We have a + // dependency on {1} anyway, so let's use it. 
+ "sbb {1:e}, {1:e}", + inlateout(reg) x => res_wrapped, + inlateout(reg) y => adjustment, + options(pure, nomem, nostack), + ); + assume(x != 0 || (res_wrapped == y && adjustment == 0)); + assume(y != 0 || (res_wrapped == x && adjustment == 0)); + // Add NEG_ORDER == subtract ORDER. + // Cannot overflow unless the assumption that x + y < 2**64 + ORDER is incorrect. + res_wrapped + adjustment + } +} + +#[inline(always)] +#[cfg(not(target_arch = "x86_64"))] +unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { + let (res_wrapped, carry) = x.overflowing_add(y); + // Below cannot overflow unless the assumption that x + y < 2**64 + ORDER is incorrect. + res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry) +} + +/// Compute the inverse of a Goldilocks element `input` using the binary GCD algorithm. +/// +/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf. +/// The key idea is to compute update factors which are incorrect by a known power of 2 which +/// can be corrected at the end. These update factors can then be used to construct the inverse +/// via a simple linear combination. +/// +/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic. +fn gcd_inversion(input: Goldilocks) -> Goldilocks { + // Initialise our values to the value we want to invert and the prime. + let (mut a, mut b) = (input.value, P); + + // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`. + // This means we will need `126` iterations of the inner loop to ensure `len(a) + len(b) ≤ 2`. + // We split the iterations into 2 rounds of length 63. + const ROUND_SIZE: usize = 63; + + // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted + // version which doesn't do any computations involving g. 
But either the compiler works this out + // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks. + let (f00, _, f10, _) = gcd_inner::(&mut a, &mut b); + let (_, _, f11, g11) = gcd_inner::(&mut a, &mut b); + + // The update factors are i64's except we need to interpret -2^63 as 2^63. + // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`. + let u = from_unusual_int(f00); + let v = from_unusual_int(f10); + let u_fac11 = from_unusual_int(f11); + let v_fac11 = from_unusual_int(g11); + + // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}. + // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66. + (u * u_fac11 + v * v_fac11).mul_2exp_u64(66) +} + +/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63. +const fn from_unusual_int(int: i64) -> Goldilocks { + if (int >= 0) || (int == i64::MIN) { + Goldilocks::new(int as u64) + } else { + Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int)) + } +} + +#[cfg(test)] +mod tests { + use p3_field::extension::BinomialExtensionField; + use p3_field_testing::{ + test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field, + }; + + use super::*; + + type F = Goldilocks; + type EF = BinomialExtensionField; + + #[test] + fn test_goldilocks() { + let f = F::new(100); + assert_eq!(f.as_canonical_u64(), 100); + + // Over the Goldilocks field, the following set of equations hold + // p = 0 + // 2^64 - 2^32 + 1 = 0 + // 2^64 = 2^32 - 1 + let f = F::new(u64::MAX); + assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1); + + let f = F::from_u64(u64::MAX); + assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1); + + // Generator check + let expected_multiplicative_group_generator = F::new(7); + assert_eq!(F::GENERATOR, expected_multiplicative_group_generator); + assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64); + + // Check on `reduce_u128` + 
let x = u128::MAX; + let y = reduce128(x); + // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1 + // 2^128 - 1 = (2^64 - 1) * (2^64 + 1) + // = (2^32 - 1 - 1) * (2^32 - 1 + 1) + // = (2^32 - 2) * (2^32) + // = 2^64 - 2 * 2^32 + // = 2^64 - 2^33 + // = 2^32 - 1 - 2^33 + // = - 2^32 - 1 + let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE; + assert_eq!(y, expected_result); + + let f = F::new(100); + assert_eq!(f.injective_exp_n().injective_exp_root_n(), f); + assert_eq!(y.injective_exp_n().injective_exp_root_n(), y); + assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO); + } + + // Goldilocks has a redundant representation for both 0 and 1. + const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)]; + const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)]; + + // Get the prime factorization of the order of the multiplicative group. + // i.e. the prime factorization of P - 1. + fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] { + [ + (BigUint::from(2u8), 32), + (BigUint::from(3u8), 1), + (BigUint::from(5u8), 1), + (BigUint::from(17u8), 1), + (BigUint::from(257u16), 1), + (BigUint::from(65537u32), 1), + ] + } + + test_field!( + crate::Goldilocks, + &super::ZEROS, + &super::ONES, + &super::multiplicative_group_prime_factorization() + ); + test_prime_field!(crate::Goldilocks); + test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES); + test_two_adic_field!(crate::Goldilocks); + + test_field_dft!( + radix2dit, + crate::Goldilocks, + super::EF, + p3_dft::Radix2Dit<_> + ); + test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers); + test_field_dft!( + parallel, + crate::Goldilocks, + super::EF, + p3_dft::Radix2DitParallel + ); +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs new file mode 100644 index 000000000..9447fe094 --- /dev/null +++ 
b/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs @@ -0,0 +1,42 @@ +//! The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`. + +#![no_std] + +extern crate alloc; + +mod extension; +mod goldilocks; +mod mds; +mod poseidon2; + +pub use goldilocks::*; +pub use mds::*; +pub use poseidon2::*; + +pub mod poseidon1; + +#[cfg(target_arch = "aarch64")] +mod aarch64_neon; + +#[cfg(target_arch = "aarch64")] +pub use aarch64_neon::*; + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx2", + not(target_feature = "avx512f") +))] +mod x86_64_avx2; + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx2", + not(target_feature = "avx512f") +))] +pub use x86_64_avx2::*; + +#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] +mod x86_64_avx512; + +#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] +pub use x86_64_avx512::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs new file mode 100644 index 000000000..df41485b3 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs @@ -0,0 +1,761 @@ +//! MDS matrices over the Goldilocks field, and permutations defined by them. +//! +//! NB: Not all sizes have fast implementations of their permutations. +//! Supported sizes: 8, 12, 16, 24, 32, 64, 68. +//! Sizes 8 and 12 are from Plonky2, size 16 was found as part of concurrent +//! work by Angus Gruen and Hamish Ivey-Law. Other sizes are from Ulrich Haböck's +//! database. + +use p3_dft::Radix2Bowers; +use p3_mds::MdsPermutation; +use p3_mds::karatsuba_convolution::Convolve; +use p3_mds::util::{apply_circulant, apply_circulant_fft, first_row_to_first_col}; +use p3_symmetric::Permutation; + +use crate::{Goldilocks, reduce128}; + +#[derive(Clone, Debug, Default)] +pub struct MdsMatrixGoldilocks; + +/// Instantiate convolution for "small" RHS vectors over Goldilocks. 
+/// +/// Here "small" means N = len(rhs) <= 16 and sum(r for r in rhs) < +/// 2^51, though in practice the sum will be less than 2^9. +#[derive(Debug)] +pub struct SmallConvolveGoldilocks; +impl Convolve for SmallConvolveGoldilocks { + const T_ZERO: i128 = 0; + const U_ZERO: i64 = 0; + + #[inline(always)] + fn halve(val: i128) -> i128 { + val >> 1 + } + + /// Return the lift of a Goldilocks element, 0 <= input.value + /// < 2^64 (the stored value need not be canonical). We widen immediately, since some valid Goldilocks elements + /// don't fit in an i64, and since in any case overflow can occur + /// for even the smallest convolutions. + #[inline(always)] + fn read(input: Goldilocks) -> i128 { + input.value as i128 + } + + /// For a convolution of size N, |x| < N * 2^64 and (as per the + /// assumption above), |y| < 2^51. So the product is at most N * + /// 2^115 which will not overflow for N <= 16. We widen `y` at + /// this point to perform the multiplication. + #[inline(always)] + fn parity_dot(u: [i128; N], v: [i64; N]) -> i128 { + let mut s = 0i128; + for i in 0..N { + s += u[i] * v[i] as i128; + } + s + } + + /// The assumptions above mean z < N^2 * 2^115, which is at most + /// 2^123 when N <= 16. + /// + /// NB: Even though intermediate values could be negative, the + /// output must be non-negative since the inputs were + /// non-negative. + #[inline(always)] + fn reduce(z: i128) -> Goldilocks { + debug_assert!(z >= 0); + reduce128(z as u128) + } +} + +const FFT_ALGO: Radix2Bowers = Radix2Bowers; + +pub(crate) const MATRIX_CIRC_MDS_8_SML_ROW: [i64; 8] = [7, 1, 3, 8, 8, 3, 4, 9]; + +/// First column of the circulant MDS matrix for width 8, derived from the first row. 
+pub const MATRIX_CIRC_MDS_8_COL: [i64; 8] = first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW); + +impl Permutation<[Goldilocks; 8]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 8]) -> [Goldilocks; 8] { + const MATRIX_CIRC_MDS_8_SML_COL: [i64; 8] = + first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW); + SmallConvolveGoldilocks::apply( + input, + MATRIX_CIRC_MDS_8_SML_COL, + SmallConvolveGoldilocks::conv8, + ) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +pub(crate) const MATRIX_CIRC_MDS_12_SML_ROW: [i64; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]; + +/// First column of the circulant MDS matrix for width 12, derived from the first row. +pub const MATRIX_CIRC_MDS_12_COL: [i64; 12] = first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW); + +impl Permutation<[Goldilocks; 12]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 12]) -> [Goldilocks; 12] { + const MATRIX_CIRC_MDS_12_SML_COL: [i64; 12] = + first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW); + SmallConvolveGoldilocks::apply( + input, + MATRIX_CIRC_MDS_12_SML_COL, + SmallConvolveGoldilocks::conv12, + ) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +pub(crate) const MATRIX_CIRC_MDS_16_SML_ROW: [i64; 16] = + [1, 1, 51, 1, 11, 17, 2, 1, 101, 63, 15, 2, 67, 22, 13, 3]; + +impl Permutation<[Goldilocks; 16]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 16]) -> [Goldilocks; 16] { + const MATRIX_CIRC_MDS_16_SML_COL: [i64; 16] = + first_row_to_first_col(&MATRIX_CIRC_MDS_16_SML_ROW); + SmallConvolveGoldilocks::apply( + input, + MATRIX_CIRC_MDS_16_SML_COL, + SmallConvolveGoldilocks::conv16, + ) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[rustfmt::skip] +pub(crate) const MATRIX_CIRC_MDS_24_GOLDILOCKS: [u64; 24] = [ + 0x5FFFFFFFA00AAAAB, 0x24021AB75BBFE656, 0x7BE9082D73B06DF5, 0x2282863E9C3A5A62, + 0xE0071C70DFFC71C8, 0x796CB65AB42A1A63, 0xDBBBBFFADFFDDDE3, 0x23B88EE217C5C9C2, + 0x20030C309FFB6DB7, 0x23C3C64763BE1E1D, 
0x0F93B7C9CC51362E, 0xC697A1094BD0850A, + 0xDFFFFFFF1FFC71C8, 0xC15A4FD614950302, 0xC41D883A4C4DEDF2, 0x187879BC23C46462, + 0x5FFCF3CEDFFE79E8, 0x1C41DF105B82398E, 0x64444003DFFDDDDA, 0x76EDDBB6F7E51F95, + 0x1FF8E38E20038E39, 0x214139BD5C40A09D, 0x3065B7CCF3B3B621, 0x23B6F4622485CEDC, +]; + +impl Permutation<[Goldilocks; 24]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 24]) -> [Goldilocks; 24] { + apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[rustfmt::skip] +const MATRIX_CIRC_MDS_32_GOLDILOCKS: [u64; 32] = [ + 0x0800000000000000, 0x69249248B4924925, 0x3ABD5EAF15EAF57B, 0x294A5294739CE73A, + 0x59E2D2CEB4B3C5A6, 0x087FBE00FF7C0220, 0xA554AA94A554AA96, 0xF00080FEFFDF8005, + 0x64CCCCCC6666699A, 0x5B13AD8973B139D9, 0xAD4A55ACA54AD5AA, 0xDA496DA3B492DB8A, + 0x4AD696955A5694B5, 0xA4A6B29A25B496D3, 0xA74EA162162BD3A9, 0xC698B3A5662CE98C, + 0xA7FFFFFF55555556, 0x4AAAAAAA5AAAAAAB, 0xB047DC113DC11F71, 0x8BA2E8B99B26C9B3, + 0xD259696C5A5B4D2E, 0xA7D540AA557EA9F6, 0x8B6E922D26DB249C, 0xFAAA805455602AAD, + 0xCB33333266666334, 0xD13B17619B13B277, 0x45B26D9326E9374A, 0x52AB552A5AA9556B, + 0x68ED2D2DB4B87697, 0x8B264C98A74E9D3B, 0x09EC23D83D847B09, 0x2C9A4D26669349A5, +]; + +impl Permutation<[Goldilocks; 32]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 32]) -> [Goldilocks; 32] { + const ENTRIES: [u64; 32] = first_row_to_first_col(&MATRIX_CIRC_MDS_32_GOLDILOCKS); + apply_circulant_fft(&FFT_ALGO, ENTRIES, &input) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[rustfmt::skip] +const MATRIX_CIRC_MDS_64_GOLDILOCKS: [u64; 64] = [ + 0x07FFFFFFFC000000, 0xFBFFFFFF04000001, 0x436DB6DB25B6DB6E, 0x4AAAAAAA5AAAAAAB, + 0x45B2D96C6D96CB66, 0x3BC7BC7B87BC7BC8, 0x6318C63125294A53, 0xCB3672CCCD9CB368, + 0xB43CB5A12D68796C, 0xFBFBFBFAFBFBFBFD, 0x883DBF107B7E2210, 0x8A7689B59B629DA3, + 0xF7FEFFDF00000001, 0x7B7C83BBC83BC47C, 0xEFF0410107EF7F83, 0x2CD8B3629CB272CA, + 
0x9800019900CCCE67, 0xFBFFFBFF07FFFC01, 0x94EC4A758C4EC628, 0xDA5A5B4A6D2D2E1F, + 0xFFEFC080FC003FFF, 0xBC387BC2C783BC79, 0xB492DB686D24B6F3, 0x1DB6925B4B6E2477, + 0x7801E0EF87BFFF10, 0xFC0803FAFBFC0409, 0x3780FE03C086F21C, 0x8B749B224DB22D94, + 0x32648B36B76E9923, 0x3BC3C3C387C3C3C4, 0x79AF286B4FCA1AF3, 0x9E2762758B627628, + 0x52AAAAAA56AAAAAB, 0xFBFFFFFEFC000001, 0xF7FFFFFF08000001, 0x2CCCCCCC9CCCCCCD, + 0xCF286BC946BCA1B0, 0xBC483B7B883B7C49, 0xD9364D9287C1F07D, 0xAD5A94A8A95AD5AA, + 0xFF871002C400F1E1, 0xFC03FC02FC03FC05, 0xD29495A4D6D4B4A6, 0x6C926DD1DD24DB65, + 0x1EDC247B4DB64937, 0x7C7B843B47BC437D, 0xA55A95AAAD5AD52C, 0x4A96D5A45AD694A6, + 0xFE6664CBCD999801, 0xFC0003FF08000401, 0x1EC4F09D64EC4D8A, 0x9E1E1D2C8B4B4A5B, + 0xD9270937709B64DC, 0x3BB77C4448843B78, 0xFFFFFFDF03FF0021, 0x59D8761D2D8A6299, + 0xC3496878A5E5A4B5, 0xFBF80402FC0403F9, 0x5ECD9B360E142851, 0x6D925D6429D64976, + 0xA8AE615C19CC2B99, 0xBC44444388444445, 0xDFE3F1F81CFC7E40, 0xDA4924916D24924A, +]; + +impl Permutation<[Goldilocks; 64]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 64]) -> [Goldilocks; 64] { + const ENTRIES: [u64; 64] = first_row_to_first_col(&MATRIX_CIRC_MDS_64_GOLDILOCKS); + apply_circulant_fft(&FFT_ALGO, ENTRIES, &input) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[rustfmt::skip] +const MATRIX_CIRC_MDS_68_GOLDILOCKS: [u64; 68] = [ + 0x03C3C3C3FC3C3C3C, 0x6799AFC54A69BC7D, 0xDA8C2C496A74B03B, 0x1E641D7AB35ED229, + 0x9239DA20DA3A2686, 0x6E23D41459EBA8C4, 0x7BC412896E2A6B3A, 0x9082059089ABD4FC, + 0x94A16FA8B0339EEE, 0x85650EC91BB519C9, 0x1600745267E94DE1, 0xFFFD8405C82020AB, + 0x21BDE80429DCED6A, 0x8ACE123AF754E343, 0xFFC7211605D2BDAE, 0xC21187AE15900F4D, + 0x9C4A889708568DC6, 0x65A5A726B5758D8E, 0x949DB90B9AC0D11A, 0x23B6CF7C368BBE52, + 0xD5128DDF59CB5A35, 0xF53BCC5BDADF3A0A, 0xBA7C5112F4BAB1CD, 0x4B93989C5B729351, + 0x6534B7E50E4AD1CB, 0x640061B54C918405, 0x0E66E1F90D2C9311, 0x31C8649B0FE7557F, + 0x0E9190D165F4A8F3, 0x52DF336BB708F919, 
0x3C0F6697F14065A5, 0xBE8190942EC50031, + 0x60038E9ACC701118, 0x73F105909A55A88B, 0xFEBEBEBDABEBEBED, 0x6F52163A64B03467, + 0xFBAE131F23A12F56, 0x1950493BC70D0676, 0x2886550DB5A1BBBF, 0x15B003D6E58181D7, + 0x3A4E7D9D44F100F8, 0x6CC3AB896025E6A0, 0x7E23E68456F825E5, 0x079CDD570B591A16, + 0xEC15A830C3D2CCD1, 0xCF4C722D2C0F8A0E, 0xC1BB6F5591B59A26, 0xB63A5931A607BDE0, + 0x43A0AD0B71040187, 0x7E4B492889D1CEE0, 0x734153F3F0C31C5B, 0x98D8D756B2725A5B, + 0x5589D20D74BA00B8, 0xB2DF58DF0A312509, 0xFABC378690D64A3A, 0x700640AFC244B695, + 0xFFA652236547F3BE, 0x2B9CA498A001D059, 0x7DACA6F16787D5DE, 0xAAAD774FAC613EA3, + 0xA88583816975CD56, 0x78B71DC516FF49CA, 0xC7BF095DF702FFA6, 0x78A60B3F971783B3, + 0xCB158EF40BC75CAC, 0xA97E818DBC152B4C, 0x9FC8339D415C3999, 0x006A88C0A0D8201C, +]; + +impl Permutation<[Goldilocks; 68]> for MdsMatrixGoldilocks { + fn permute(&self, input: [Goldilocks; 68]) -> [Goldilocks; 68] { + apply_circulant(&MATRIX_CIRC_MDS_68_GOLDILOCKS, &input) + } +} +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[cfg(test)] +mod tests { + use p3_symmetric::Permutation; + + use super::{Goldilocks, MdsMatrixGoldilocks}; + + #[test] + fn goldilocks8() { + let input: [Goldilocks; 8] = Goldilocks::new_array([ + 2434589605738284713, + 4817685620989478889, + 13397079175138649456, + 11944520631108649751, + 1033251468644039632, + 3092099742268329866, + 7160548811622790454, + 9959569614427134344, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 8] = Goldilocks::new_array([ + 16726687146516531007, + 14721040752765534861, + 15566838577475948790, + 9095485010737904250, + 11353934351835864222, + 11056556168691087893, + 4199602889124860181, + 315643510993921470, + ]); + + assert_eq!(output, expected); + } + + #[test] + fn goldilocks12() { + let input: [Goldilocks; 12] = Goldilocks::new_array([ + 14847187883725400244, + 969392934980971521, + 6996647758016470432, + 4674844440624672154, + 264841656685969785, + 1246852265697711623, + 
18223868478428473484, + 12122736699239070772, + 11263701854732819430, + 12739925508864285577, + 11648637570857932167, + 14090978315217600393, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 12] = Goldilocks::new_array([ + 9322351889214742299, + 8700136572060418355, + 4881757876459003977, + 9899544690241851021, + 480548822895830465, + 5445915149371405525, + 14955363277757168581, + 6672733082273363313, + 190938676320003294, + 1613225933948270736, + 3549006224849989171, + 12169032187873197425, + ]); + + assert_eq!(output, expected); + } + + #[test] + fn goldilocks16() { + let input: [Goldilocks; 16] = Goldilocks::new_array([ + 13216135600341032847, + 15626390207663319651, + 2052474569300149934, + 4375663431730581786, + 16596827905941257435, + 10019626608444427271, + 7831946179065963230, + 17104499871144693506, + 9021930732511690478, + 6899419210615882449, + 8131182521761419514, + 432489675596019804, + 8508050013409958723, + 14134506582804571789, + 13283546413390931641, + 14711125975653831032, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 16] = Goldilocks::new_array([ + 9484392671298797780, + 149770626972189150, + 12125722600598304117, + 15945232149672903756, + 13199929870021500593, + 18443980893262804946, + 317150800081307627, + 16910019239751125049, + 1996802739033818490, + 11668458913264624237, + 11078800762167869397, + 13758408662406282356, + 11119677412113674380, + 7344117715971661026, + 4202436890275702092, + 681166793519210465, + ]); + + assert_eq!(output, expected); + } + + #[test] + fn goldilocks24() { + let input: [Goldilocks; 24] = Goldilocks::new_array([ + 11426771245122339662, + 5975488243963332229, + 11441424994503305651, + 5755561333702259678, + 7295454168648181339, + 16724279929816174064, + 32359231037136391, + 3713621595270370753, + 8421765959140936778, + 12370571593326246544, + 8633733294559731287, + 12765436832373161027, + 15606692828890413034, + 
8068160018166226874, + 10719661629577139538, + 13036735610140127982, + 10213543772818211674, + 8041886705706266368, + 12022983417703446028, + 4179370708601587579, + 11125302089484330465, + 9904943018174649533, + 16178194376951442671, + 1545799842160818502, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 24] = Goldilocks::new_array([ + 18431075688485197060, + 14823984346528185622, + 7262979358411339215, + 14816911393874702213, + 6721523710303409972, + 10829861327716364029, + 2456948878733883601, + 11088379938350287658, + 3820735023521527858, + 9062288923770492958, + 5159244568306327366, + 1401669669887165869, + 11908734248351870182, + 10640195377186320543, + 6552733980894593378, + 17103376282032495459, + 5204287788603805758, + 17783185518697631139, + 9006863878586007300, + 11122535637762904803, + 5271621316102699962, + 9734499541452484536, + 11778274360927642637, + 3217831681350496533, + ]); + + assert_eq!(output, expected); + } + + #[test] + fn goldilocks32() { + let input: [Goldilocks; 32] = Goldilocks::new_array([ + 8401806579759049284, + 14709608922272986544, + 8130995604641968478, + 7833133203357642391, + 10700492548100684406, + 3941105252506602047, + 8122370916776133262, + 15079919378435648206, + 8774521769784086994, + 16794844316583392853, + 9356562741425567167, + 13317198313361936216, + 7187680218428599522, + 16525662096158660997, + 540453741156061014, + 16543585577270698663, + 3802215918136285729, + 11389297895303247764, + 5133769394766075512, + 1057795099426170863, + 18037861421172314665, + 17632255188776359310, + 17616515088477043142, + 13307921676744533876, + 17602277262015191215, + 15819040654617566738, + 11961318546000835928, + 15593174310433874065, + 9152657050882549004, + 4801868480369948110, + 13202076339494141066, + 726396847460932316, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 32] = Goldilocks::new_array([ + 1179701925859507209, + 5543239597787055637, + 
5978278622530964070, + 3622388166841103287, + 11383243182536830899, + 14719109850604985734, + 17672601866826623850, + 4879627080283827596, + 7556887460241466109, + 9548493506061808122, + 13980851986825291174, + 2029844508485082398, + 10375517623784134775, + 13067093881736606569, + 6446569064196467795, + 15375603814779462714, + 11307946648742033371, + 1593906954637160608, + 5776169226282316678, + 8167048017892669861, + 3954052226208277367, + 9346878497567392707, + 5570872870988220142, + 10792661164389799960, + 17494962593174487938, + 7080549557843445752, + 14059834522311268132, + 17747288366997773235, + 17158122400620315305, + 6816598002359267850, + 12363049840026116993, + 13313901185845854868, + ]); + + assert_eq!(output, expected); + } + + #[test] + fn goldilocks64() { + let input: [Goldilocks; 64] = Goldilocks::new_array([ + 3471075506106776899, + 4817046918282259009, + 3480368692354016145, + 18110937755057600106, + 3130862083451221140, + 15376650156021437015, + 7997596749112997445, + 7742916918728590149, + 421644639408377358, + 2491271421424548020, + 1940196613872160755, + 7152053147988203177, + 13697425352450853423, + 15877844788345672674, + 17787098720906653510, + 6857627524724866519, + 8541180216786820396, + 10769715704553877654, + 9265712399189924160, + 10220120296438955872, + 18201417281995610945, + 6749698931189855822, + 13700000989116811950, + 13205437213697578097, + 10514342943989454609, + 9926015350795325725, + 2289808224483690257, + 12598806357998460973, + 14393945610969324307, + 4744625557965362093, + 2270701163031951561, + 2927942398784334090, + 5250916386894733430, + 4030189910566345872, + 4953663590324639075, + 1241519685782896035, + 8681312160951359069, + 8236353015475387411, + 4972690458759871996, + 1396852754187463352, + 17512022752774329733, + 14009268822557836700, + 1346736409027879377, + 7609463340861239931, + 10701512803758419515, + 5067199073587389986, + 5030018986055211116, + 17692625804700013551, + 9992938630604785132, + 
15350127009762647067, + 10247405821493235386, + 15172888833500531069, + 14657693742399622179, + 7391511805216089127, + 2035742693690795598, + 4047216012963057952, + 12602085105939403203, + 16985723692990258059, + 12141021186082151434, + 3174646196626212833, + 16484520987666295947, + 10579720164460442970, + 9596917135039689219, + 13761818390665814258, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 64] = Goldilocks::new_array([ + 9158798369861934356, + 9224859686427886689, + 16948559910286211274, + 15765762765140902574, + 16202509467561200764, + 1911749439284071529, + 4607026757869726805, + 8473827004973131317, + 13716800466551879373, + 6670177022201597800, + 17416833238376299449, + 14953676562252669578, + 5828107070718286209, + 17980287408679531241, + 2220583438808757820, + 14564318040622847100, + 3950519594558514416, + 12164610170526828198, + 457385640833960098, + 14068973922383216628, + 9614382247226943793, + 3932756878771319222, + 12728498054939249570, + 9435109056498897661, + 7283114805836756402, + 1720178259138435097, + 11496602000538177285, + 7736206812858942065, + 14289784438950643645, + 12052665489155550962, + 12918409840610303255, + 5224324424989208352, + 7826309014606327907, + 11657314889847733528, + 13899641072303006348, + 7501780959676548477, + 1064261716045449147, + 1487682458939665452, + 10894217148983862136, + 12785338167343566981, + 8043323074629160032, + 10852328074701301213, + 15029722608724150267, + 2611937278660861263, + 13995790409949796943, + 7103138700054564899, + 12756778219044204581, + 4147399997707606088, + 11930966590061754579, + 16708700985380478903, + 2370160521342035603, + 14893791582608133454, + 15313288276425450946, + 16224601303711716386, + 4488931442519177087, + 7443169181907410918, + 12381442753785370161, + 16366345507676500076, + 8097905256807642731, + 8504207502183388457, + 11400931328719780407, + 10879211614969476303, + 7265889003783205111, + 7322738272300165489, + ]); + + 
assert_eq!(output, expected); + } + + #[test] + fn goldilocks68() { + let input: [Goldilocks; 68] = Goldilocks::new_array([ + 16450563043143968653, + 3688080826640678185, + 133253417037384537, + 17501558583799613353, + 14920674569425704293, + 5030578721963251055, + 9795600398273758687, + 402012644192671817, + 10657312189068414445, + 9508835336085746575, + 16081669758721272608, + 2072823794278273547, + 16831381326702573736, + 11381683312293543190, + 5679539322738625588, + 9346499485038639332, + 15554202803455984983, + 18373955571490331663, + 11323895584334729789, + 16834542679468148445, + 14751528164286075953, + 3755158780970327991, + 12622814707645103582, + 10329238611694882547, + 7642766530280843057, + 4876120096290984742, + 412912224820604426, + 9118233770240274553, + 3626520971021993076, + 10841049054903806738, + 18205546599950141835, + 7198482606375262809, + 17183313930831625294, + 10181033256431249241, + 1061211413812819905, + 3980261141891682525, + 5674176959446948353, + 6062696542969845681, + 3383081006315025715, + 8812665902421024067, + 3093645099818246186, + 16178737149039707082, + 8204245222345541411, + 11072582337937050490, + 17969785901925882398, + 4670890092981706609, + 12537558683977529426, + 12084598516323376868, + 16293685096019175644, + 10117612240421467846, + 17873102395739074620, + 11220493906741851877, + 4632957003022201019, + 12934229307704669322, + 2152792796882257594, + 12521131928134126701, + 17472006670677761650, + 4560570065837283016, + 6315543803073912887, + 4098689719955359793, + 1784883877365258237, + 6837590090927294950, + 2391417016765166652, + 16389291664603960875, + 12285946887702044436, + 7231705445010258971, + 12976071926225281356, + 8829402645443096358, + ]); + + let output = MdsMatrixGoldilocks.permute(input); + + let expected: [Goldilocks; 68] = Goldilocks::new_array([ + 4984914285749049383, + 10397959071664799177, + 3331616814639908945, + 4252459885611162121, + 5517786723806029201, + 1826620401370703815, + 8257849352373689773, 
+ 1722805960790112693, + 17654983138917187833, + 7542660006721409612, + 1970182718241277021, + 12865815507550811641, + 17507096607056552658, + 7988714902687660369, + 150082662759625574, + 17329095993317360383, + 965880604543562997, + 2820931239306841741, + 1980667983336380501, + 3781794112174728826, + 7323192150179872391, + 12243426826276589932, + 315076483410634889, + 3221894784246078707, + 3515955216509190252, + 964376148920419876, + 7679719864273407732, + 2516714701741920303, + 4837221266652621366, + 15301563603415983061, + 10380321314559647625, + 3023678426639670063, + 12020917879204725519, + 10595808165609787680, + 14199186729378048831, + 4520610719509879248, + 9983949546821718635, + 5066092593424854949, + 13843503196305181790, + 14296362815835302652, + 6766348697864530153, + 13804582129741554661, + 8032169955336281598, + 5198513488794721460, + 10613667919514788349, + 7948289550930596506, + 14118391408956101449, + 4356952068887595371, + 709878153008378134, + 17168579964784489802, + 17840495726541494819, + 2710471020841761312, + 9950159372116756450, + 3909574932971200058, + 2430964021804554670, + 6035162446515244642, + 14656543530572478095, + 1539013407173403800, + 4150113154618904744, + 4904646199269229662, + 17257014030727492672, + 3791823431764085889, + 13680668409434600948, + 12367427987617118934, + 12462908457168650050, + 10891613749697412017, + 6867760775372053830, + 12474954319307005079, + ]); + + assert_eq!(output, expected); + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs new file mode 100644 index 000000000..89da79e45 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs @@ -0,0 +1,1143 @@ +//! Poseidon1 permutation for Goldilocks. +//! +//! # Overview +//! +//! This module provides the Poseidon1 hash permutation instantiated for the +//! Goldilocks field (p = 2^64 - 2^32 + 1). The public API is a single type +//! 
alias that transparently dispatches to the best available implementation. +//! +//! # Platform Dispatch +//! +//! On **aarch64**, the type alias resolves to a dual-dispatch wrapper: +//! scalar permutations use NEON-accelerated MDS for full rounds with +//! LLVM-optimized sparse partial rounds, while packed NEON permutations +//! use the fused dual-lane ASM path (w8) or per-lane scalar path (w12). +//! +//! On **all other platforms**, it resolves to the generic Poseidon1 +//! implementation with Karatsuba MDS convolution. +//! +//! No `#[cfg]` is needed in calling code. +//! +//! # MDS Matrix +//! +//! The MDS matrix is a **circulant** matrix sourced from the MDS crate. +//! At runtime, it is applied via fast Karatsuba convolution (sub-O(t^2)). +//! During initialization only, it is expanded to dense form for the +//! sparse matrix decomposition of partial rounds. +//! +//! # Round Constants +//! +//! Generated by the Grain LFSR (Poseidon1 paper, Appendix E) with SBOX=0 (x^alpha encoding). + +use p3_poseidon1::{ + Poseidon1, Poseidon1Constants, Poseidon1ExternalLayerGeneric, Poseidon1InternalLayerGeneric, +}; + +use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL}; +use crate::{Goldilocks, MdsMatrixGoldilocks}; + +/// S-box degree for Goldilocks Poseidon1. +/// +/// The S-box raises each element to this power. For the Goldilocks prime `p`, +/// `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. Neither 3 nor 5 is coprime to +/// `p - 1`, so `x^3` and `x^5` are not permutations; the smallest valid exponent is 7. +pub const GOLDILOCKS_S_BOX_DEGREE: u64 = 7; + +/// Number of full rounds per half for Goldilocks Poseidon (`RF / 2`). +/// +/// The total number of full rounds is `RF = 8` (4 beginning + 4 ending). +/// Follows the Poseidon paper's security analysis (Section 5.4) with a +2 RF margin. +pub const GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS: usize = 4; + +/// Number of partial rounds for Goldilocks Poseidon (width 8). +/// +/// Derived from the interpolation bound in the Poseidon paper (Eq.
3): +/// +/// R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5 +/// = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20 +/// +/// With the +7.5% security margin (Section 5.4): ⌈1.075 × 20⌉ = 22. +pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8: usize = 22; + +/// Number of partial rounds for Goldilocks Poseidon (width 12). +/// +/// Same interpolation bound as width 8: +/// +/// R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20 +/// +/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22. +pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12: usize = 22; + +/// Generic (non-fused) Poseidon1 permutation for Goldilocks. +/// +/// Uses the platform-independent Poseidon1 implementation with Karatsuba +/// MDS convolution. Used directly for widths not supported by the fused +/// type (e.g. 16, 24) and as the non-aarch64 fallback for widths 8 and 12. +pub type Poseidon1GoldilocksGeneric = Poseidon1< + Goldilocks, + Poseidon1ExternalLayerGeneric, + Poseidon1InternalLayerGeneric, + WIDTH, + GOLDILOCKS_S_BOX_DEGREE, +>; + +/// Unified Poseidon1 permutation for Goldilocks. +/// +/// On aarch64, resolves to a dual-dispatch wrapper: scalar permutations +/// use NEON MDS for full rounds with sparse partial rounds, packed NEON +/// permutations use fused dual-lane ASM (w8) or per-lane scalar (w12). +/// +/// On all other platforms, resolves to the generic implementation with +/// Karatsuba MDS convolution. +/// +/// Supports both scalar and packed state representations transparently. +#[cfg(target_arch = "aarch64")] +pub type Poseidon1Goldilocks = crate::Poseidon1GoldilocksDispatch; + +/// Unified Poseidon1 permutation for Goldilocks. +/// +/// On aarch64, resolves to a dual-dispatch wrapper (NEON MDS scalar path; +/// fused dual-lane ASM (w8) or per-lane scalar (w12) packed path). +/// +/// On all other platforms (this definition), resolves to the generic +/// implementation with Karatsuba MDS convolution. +/// +/// Supports both scalar and packed state representations transparently.
+#[cfg(not(target_arch = "aarch64"))] +pub type Poseidon1Goldilocks = Poseidon1GoldilocksGeneric; + +/// Round constants for width-8 Poseidon1 on Goldilocks. +/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 +/// +/// Generated by `poseidon/generate_constants.py --field goldilocks --width 8`. +/// +/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)]. +pub const GOLDILOCKS_POSEIDON1_RC_8: [[Goldilocks; 8]; 30] = Goldilocks::new_2d_array([ + // Initial full rounds (4) + [ + 0xdd5743e7f2a5a5d9, + 0xcb3a864e58ada44b, + 0xffa2449ed32f8cdc, + 0x42025f65d6bd13ee, + 0x7889175e25506323, + 0x34b98bb03d24b737, + 0xbdcc535ecc4faa2a, + 0x5b20ad869fc0d033, + ], + [ + 0xf1dda5b9259dfcb4, + 0x27515210be112d59, + 0x4227d1718c766c3f, + 0x26d333161a5bd794, + 0x49b938957bf4b026, + 0x4a56b5938b213669, + 0x1120426b48c8353d, + 0x6b323c3f10a56cad, + ], + [ + 0xce57d6245ddca6b2, + 0xb1fc8d402bba1eb1, + 0xb5c5096ca959bd04, + 0x6db55cd306d31f7f, + 0xc49d293a81cb9641, + 0x1ce55a4fe979719f, + 0xa92e60a9d178a4d1, + 0x002cc64973bcfd8c, + ], + [ + 0xcea721cce82fb11b, + 0xe5b55eb8098ece81, + 0x4e30525c6f1ddd66, + 0x43c6702827070987, + 0xaca68430a7b5762a, + 0x3674238634df9c93, + 0x88cee1c825e33433, + 0xde99ae8d74b57176, + ], + // Partial rounds (22) + [ + 0x488897d85ff51f56, + 0x1140737ccb162218, + 0xa7eeb9215866ed35, + 0x9bd2976fee49fcc9, + 0xc0c8f0de580a3fcc, + 0x4fb2dae6ee8fc793, + 0x343a89f35f37395b, + 0x223b525a77ca72c8, + ], + [ + 0x56ccb62574aaa918, + 0xc4d507d8027af9ed, + 0xa080673cf0b7e95c, + 0xf0184884eb70dcf8, + 0x044f10b0cb3d5c69, + 0xe9e3f7993938f186, + 0x1b761c80e772f459, + 0x606cec607a1b5fac, + ], + [ + 0x14a0c2e1d45f03cd, + 0x4eace8855398574f, + 0xf905ca7103eff3e6, + 0xf8c8f8d20862c059, + 0xb524fe8bdd678e5a, + 0xfbb7865901a1ec41, + 0x014ef1197d341346, + 0x9725e20825d07394, + ], + [ + 0xfdb25aef2c5bae3b, + 0xbe5402dc598c971e, + 0x93a5711f04cdca3d, + 0xc45a9a5b2f8fb97b, + 
0xfe8946a924933545, + 0x2af997a27369091c, + 0xaa62c88e0b294011, + 0x058eb9d810ce9f74, + ], + [ + 0xb3cb23eced349ae4, + 0xa3648177a77b4a84, + 0x43153d905992d95d, + 0xf4e2a97cda44aa4b, + 0x5baa2702b908682f, + 0x082923bdf4f750d1, + 0x98ae09a325893803, + 0xf8a6475077968838, + ], + [ + 0xceb0735bf00b2c5f, + 0x0a1a5d953888e072, + 0x2fcb190489f94475, + 0xb5be06270dec69fc, + 0x739cb934b09acf8b, + 0x537750b75ec7f25b, + 0xe9dd318bae1f3961, + 0xf7462137299efe1a, + ], + [ + 0xb1f6b8eee9adb940, + 0xbdebcc8a809dfe6b, + 0x40fc1f791b178113, + 0x3ac1c3362d014864, + 0x9a016184bdb8aeba, + 0x95f2394459fbc25e, + 0xe3f34a07a76a66c2, + 0x8df25f9ad98b1b96, + ], + [ + 0x85ffc27171439d9d, + 0xddcb9a2dcfd26910, + 0x26b5ba4bf3afb94e, + 0xffff9cc7c7651e2f, + 0x8c88364698280b55, + 0xebc114167b910501, + 0x2d77b4d89ecfb516, + 0x332e0828eba151f2, + ], + [ + 0x46fa6a6450dd4735, + 0xd00db7dd92384a33, + 0x5fd4fb751f3a5fc5, + 0x496fb90c0bb65ea2, + 0xf3baec0bb87cc5c7, + 0x862a3c0a7d4c7713, + 0xbf5f38336a3f47d8, + 0x41ad9dbc1394a20c, + ], + [ + 0xcc535945b7dbf0f7, + 0x82af2bc93685bcec, + 0x8e4c8d0c8cebfccd, + 0x17cb39417e84597e, + 0xd4a965a8c749b232, + 0xa2cab040f33f3ee5, + 0xa98811a1fed4e3a6, + 0x1cc48b54f377e2a1, + ], + [ + 0xe40cd4f6c5609a27, + 0x11de79ebca97a4a4, + 0x9177c73d8b7e929d, + 0x2a6fe8085797e792, + 0x3de6e93329f8d5ae, + 0x3f7af9125da962ff, + 0xd710682cfc77d3ac, + 0x48faf05f3b053cf4, + ], + [ + 0x287db8630da89c8b, + 0x4d0de32053cb30e9, + 0x8b37a4f20c5ada7b, + 0xe7cc6ebe78c84ecf, + 0x240bdc0a66a2610d, + 0x8299e7f02caa1650, + 0x380a53fefb6e754e, + 0x684a1d8cf8eb6810, + ], + [ + 0xe839452eb4b8a5e1, + 0xb03fa62e90626af4, + 0x11a688602fbc5efc, + 0x30dda75c355a2d62, + 0x0f712adcb73810de, + 0xffdc1102187f1ae1, + 0x40c34f398254b99c, + 0xede021b9dc289a4a, + ], + [ + 0x8b7b05225c4e7dad, + 0x3bc794346f9d9ff9, + 0xfccb5a57f2ca86ff, + 0xbb1502015a7da9d4, + 0xd7e0a35d4352a015, + 0x27af7a44f8160931, + 0xc37442f6782f4615, + 0xbdf392a9bd095dcb, + ], + [ + 0xc17f55037cf00de9, + 0xbcffedd34c71a874, + 
0x5eb45d2a8133d1f2, + 0xbabe251e1612ebdf, + 0x3efeb9fbe438c536, + 0x2d7cef97b4afe1cf, + 0xe5de1b4660016c0b, + 0xcdcc26c332f5657c, + ], + [ + 0xe01dd653daf15809, + 0xb0a6bdd4b41094b5, + 0x27eac858b0b03a05, + 0x51d43b5e93adbdc0, + 0x8b89a23b0fea5fc9, + 0xdc8ac3b14f7f2fc1, + 0xe793f82f1efec039, + 0x9f6f2cf8969e7b80, + ], + [ + 0x49d45382e0f21d4a, + 0x5f4ad1797cd72786, + 0x4dc3dbebfd45f795, + 0x03a3ef84dba6e1bc, + 0x204bc9b3d3fc4c01, + 0x9ad706081e89b9ba, + 0x638bfb4d840e9f89, + 0x5ef2938cd095ae35, + ], + [ + 0x42cca18ebeb265c8, + 0xb7b2ec5c29aecbf8, + 0x0d84f9535dc78f0f, + 0x04e64ad942e77b8c, + 0xb4880dffffc9da0b, + 0x16db16d9c29adeb1, + 0x09bbaf2a0590cd1e, + 0x76460e74961fcf8d, + ], + [ + 0xed12a2276dfa1553, + 0x0b5acec5de0436fd, + 0x3c6cfea033a1f0a8, + 0x2b5ecefe546cac15, + 0x6e2d82884cd3bf6f, + 0xc134878d1add7b83, + 0x997963422eb7a280, + 0x5e834537ac648cf6, + ], + [ + 0x89e779214737c0b7, + 0x1a8c05e8581ad95b, + 0x8d18b72796437cf7, + 0xe7252c949e04b106, + 0x53267c4fd174585a, + 0xa16ef5d9c81dad47, + 0xda65191937270a46, + 0xcb2a5b55f2df664c, + ], + [ + 0x854aee2dc1924137, + 0xf37013c9d479ece6, + 0x0e163bc0630c4696, + 0x384ee64955048f76, + 0xf65d814e28ee4ec5, + 0xe57bc564fd82f1b1, + 0x4b338937b6876614, + 0x66ee0b04ed43cd8d, + ], + [ + 0x49884bf25f4ef15d, + 0xeb51fe28de1c6f54, + 0x2cd64e84fce8dfcc, + 0x29164a96a541a013, + 0x173ce7558f4cacb8, + 0xeb5b1ce5877c89e9, + 0x5faff4b0f5217bf6, + 0xac42d0b1c20f205e, + ], + // Terminal full rounds (4) + [ + 0xfb1d6bf0ca43221b, + 0x97b0a1b01d6a2955, + 0x08c60bd622952b30, + 0x43f2be0f9e24147c, + 0xfa7268b7d3730f5d, + 0x43a6c419a23983bb, + 0xcd77c1f7b29b113c, + 0xcfa43c9db8eec29f, + ], + [ + 0xcaaa95a6c7365dec, + 0x0a91193f798f3be0, + 0x1104497652735dc6, + 0x35aecb93663b515e, + 0x8dbc9916065aa858, + 0xada8f7a0266579ed, + 0x524dee7bec1ea789, + 0xa93aee9dd5af9521, + ], + [ + 0x9d1f1b54750d707e, + 0x7c9feab87096d5dc, + 0xa2e1fb19f9d4261b, + 0xb714deb448de6346, + 0x225d1f0d011c5403, + 0x1549b7f1d28cedc0, + 0xaef3e46f97d43942, + 
0x6dfc7ffe0b38bf08, + ], + [ + 0x7de853fdc542b663, + 0xa68ecc96610657b2, + 0xe88bb5428af289b1, + 0xd7cfa1504c5569f5, + 0x78a9aad0d642d30a, + 0xd68315f2353dce52, + 0x46e56300f86fcfd5, + 0x323d95332b145fd6, + ], +]); + +/// Round constants for width-12 Poseidon1 on Goldilocks. +/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 +/// +/// Generated by `poseidon/generate_constants.py --field goldilocks --width 12`. +/// +/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)]. +pub const GOLDILOCKS_POSEIDON1_RC_12: [[Goldilocks; 12]; 30] = Goldilocks::new_2d_array([ + // Initial full rounds (4) + [ + 0x13dcf33aba214f46, + 0x30b3b654a1da6d83, + 0x1fc634ada6159b56, + 0x937459964dc03466, + 0xedd2ef2ca7949924, + 0xede9affde0e22f68, + 0x8515b9d6bac9282d, + 0x6b5c07b4e9e900d8, + 0x1ec66368838c8a08, + 0x9042367d80d1fbab, + 0x400283564a3c3799, + 0x4a00be0466bca75e, + ], + [ + 0x7913beee58e3817f, + 0xf545e88532237d90, + 0x22f8cb8736042005, + 0x6f04990e247a2623, + 0xfe22e87ba37c38cd, + 0xd20e32c85ffe2815, + 0x117227674048fe73, + 0x4e9fb7ea98a6b145, + 0xe0866c232b8af08b, + 0x00bbc77916884964, + 0x7031c0fb990d7116, + 0x240a9e87cf35108f, + ], + [ + 0x2e6363a5a12244b3, + 0x5e1c3787d1b5011c, + 0x4132660e2a196e8b, + 0x3a013b648d3d4327, + 0xf79839f49888ea43, + 0xfe85658ebafe1439, + 0xb6889825a14240bd, + 0x578453605541382b, + 0x4508cda8f6b63ce9, + 0x9c3ef35848684c91, + 0x0812bde23c87178c, + 0xfe49638f7f722c14, + ], + [ + 0x8e3f688ce885cbf5, + 0xb8e110acf746a87d, + 0xb4b2e8973a6dabef, + 0x9e714c5da3d462ec, + 0x6438f9033d3d0c15, + 0x24312f7cf1a27199, + 0x23f843bb47acbf71, + 0x9183f11a34be9f01, + 0x839062fbb9d45dbf, + 0x24b56e7e6c2e43fa, + 0xe1683da61c962a72, + 0xa95c63971a19bfa7, + ], + // Partial rounds (22) + [ + 0x4adf842aa75d4316, + 0xf8fbb871aa4ab4eb, + 0x68e85b6eb2dd6aeb, + 0x07a0b06b2d270380, + 0xd94e0228bd282de4, + 0x8bdd91d3250c5278, + 0x209c68b88bba778f, + 0xb5e18cdab77f3877, + 
0xb296a3e808da93fa, + 0x8370ecbda11a327e, + 0x3f9075283775dad8, + 0xb78095bb23c6aa84, + ], + [ + 0x3f36b9fe72ad4e5f, + 0x69bc96780b10b553, + 0x3f1d341f2eb7b881, + 0x4e939e9815838818, + 0xda366b3ae2a31604, + 0xbc89db1e7287d509, + 0x6102f411f9ef5659, + 0x58725c5e7ac1f0ab, + 0x0df5856c798883e7, + 0xf7bb62a8da4c961b, + 0xc68be7c94882a24d, + 0xaf996d5d5cdaedd9, + ], + [ + 0x9717f025e7daf6a5, + 0x6436679e6e7216f4, + 0x8a223d99047af267, + 0xbb512e35a133ba9a, + 0xfbbf44097671aa03, + 0xf04058ebf6811e61, + 0x5cca84703fac7ffb, + 0x9b55c7945de6469f, + 0x8e05bf09808e934f, + 0x2ea900de876307d7, + 0x7748fff2b38dfb89, + 0x6b99a676dd3b5d81, + ], + [ + 0xac4bb7c627cf7c13, + 0xadb6ebe5e9e2f5ba, + 0x2d33378cafa24ae3, + 0x1e5b73807543f8c2, + 0x09208814bfebb10f, + 0x782e64b6bb5b93dd, + 0xadd5a48eac90b50f, + 0xadd4c54c736ea4b1, + 0xd58dbb86ed817fd8, + 0x6d5ed1a533f34ddd, + 0x28686aa3e36b7cb9, + 0x591abd3476689f36, + ], + [ + 0x047d766678f13875, + 0xa2a11112625f5b49, + 0x21fd10a3f8304958, + 0xf9b40711443b0280, + 0xd2697eb8b2bde88e, + 0x3493790b51731b3f, + 0x11caf9dd73764023, + 0x7acfb8f72878164e, + 0x744ec4db23cefc26, + 0x1e00e58f422c6340, + 0x21dd28d906a62dda, + 0xf32a46ab5f465b5f, + ], + [ + 0xbfce13201f3f7e6b, + 0xf30d2e7adb5304e2, + 0xecdf4ee4abad48e9, + 0xf94e82182d395019, + 0x4ee52e3744d887c5, + 0xa1341c7cac0083b2, + 0x2302fb26c30c834a, + 0xaea3c587273bf7d3, + 0xf798e24961823ec7, + 0x962deba3e9a2cd94, + 0xb36ee79485ca4707, + 0xd380199eddd2de52, + ], + [ + 0x70971fc4e6f85305, + 0x8e722f6e5dc32699, + 0xa0883df133052b92, + 0x8f86c6a3eb7d01a4, + 0x763649c8b670bdc5, + 0x830d5c82b808759b, + 0xaa1da8bb91da02e7, + 0x9bc9bf629e211c4d, + 0x0f0a899b10a4dea8, + 0xb883bdcee7c6b356, + 0x78c7101e7496ae1e, + 0x2fd6c5a8bf1e5ca6, + ], + [ + 0xe2a6e06e61fcec9c, + 0xebfce7d5c5b3dbd5, + 0xca2eeca4bb485d85, + 0xc2b875537c42eb69, + 0x6faf849976873328, + 0xfc3fcb6e81ad4cc3, + 0x180dd95503955a28, + 0xd40f19a3c9fe1520, + 0x49d178ddbf7fd96d, + 0x3950bee2e10e0297, + 0x437b90cf295be062, + 0xa5cd126edffad23b, + 
], + [ + 0xdf58134c134491c2, + 0x0677eca229d9f7bd, + 0x492200a1f7d83a3c, + 0xafb58c9810a43645, + 0x7659077c5a9c208e, + 0x30b4bc83706995cd, + 0xc98fa77bbbef3a3b, + 0x84a82905750b3109, + 0x72f2a02326aeb69b, + 0x8d27a2a2d73a848a, + 0xaa9e30a80bde4b68, + 0x63abb1415e050474, + ], + [ + 0x1c4bd1e816050a7e, + 0x15d1502e4f469dfd, + 0x53989d594b0c4cd8, + 0x7a1a4c83cb7e377e, + 0x1b52f8a9944e480e, + 0xeb7b03f76a91a79e, + 0x0073a4fc9328c69e, + 0x2c7b16f8620d9de4, + 0x950d052963e46bc4, + 0x8d201ba1a9c89fac, + 0xd3502941bdf35503, + 0x7c6dfcd5af8676fb, + ], + [ + 0xf8a6cd02e92cdb0b, + 0x6e7500f3a5464b22, + 0x07637eabba4bdd20, + 0x88b82717beee0e14, + 0xbaa2b1cd3dd4c79a, + 0xdfecc3aebec4cfa6, + 0x7561087b0cff0166, + 0x538fcac317a703a6, + 0xd7d6c6eeeeeeea19, + 0xd647b1ee441658a0, + 0xdf4442110236c546, + 0x559ef2c6dd73ec15, + ], + [ + 0x4c0f5fc6c0dda3d1, + 0x685010cc3100cea7, + 0x2fb6ba8aa0344440, + 0xb515f0a3ca75f1fb, + 0x886887eaecb87c10, + 0xf03ec3fd710abb04, + 0xd3b4763e17f543ef, + 0x50d9e5716e78083a, + 0x0bce2385cf8d74ff, + 0xaf23032cd5f0e04b, + 0xd366aa112b6159d9, + 0x810a3ad3ac7979db, + ], + [ + 0x0a4a11d794be40a2, + 0xeebf0cf23b668a3f, + 0x600873fb011d761b, + 0x0bfb5591a02ff618, + 0xa16e2a528910af52, + 0xf6553653e2878421, + 0xccbe7c7a601a30c0, + 0xb18b214fe489f5b3, + 0xe21017ab9e153425, + 0x586099ede17af9a6, + 0x385078b514f50647, + 0xc02b3a9afb89883d, + ], + [ + 0x6d3fbd3b4a9f1de6, + 0x4b4d40a41b0f473c, + 0x838f1887b8f31711, + 0x9396895be5c58a41, + 0x6247a479d66fc2e3, + 0x13fe228a98f2d0a2, + 0x5ba5fde765f9481e, + 0xafb89fa62267e117, + 0xfa4dc1bebcaa6333, + 0xdbab590882b87289, + 0xc3b6c08e23ba9301, + 0xd84b5de94a324fb7, + ], + [ + 0x0d0c371c5b35b850, + 0x7964f570e7188038, + 0x5daf18bbd996604c, + 0x6743bc47b9595258, + 0x5528b9362c59bb71, + 0xac45e25b7127b68c, + 0xa2077d7dfbb606b6, + 0xf3faac6faee378af, + 0x0c6388b51545e884, + 0xd27dbb6944917b61, + 0x89bcac584344c104, + 0x856bab802ce7402d, + ], + [ + 0x2cff3000be1fcd0a, + 0x765f2977fa72a917, + 0x1443711329f5f9d5, + 
0xd35cd0261af2f951, + 0x2a1bb986084ec281, + 0x2334a54b758f23f2, + 0xa9b8cb612caf706b, + 0xb6ba11c4ab1a1017, + 0xde96b0824b4b46e2, + 0xc59d4272c6d92e2c, + 0x389bb5107611754d, + 0x23647fbc77657372, + ], + [ + 0xd5ef60d6f76a42fa, + 0xebb406bb79ac9819, + 0x55faccc709a2f423, + 0xd9d6ea97490091cd, + 0xef3ce5069647a7e4, + 0xdf31625d3fa78464, + 0x242e60fd68f10f66, + 0x39c966cc815f084d, + 0x20e2e22e02bae3f7, + 0xb38919d3f1173d7c, + 0xf17769f6c77084d9, + 0xcc051d8094cac41f, + ], + [ + 0x942069f5d6eece7e, + 0x8d61d3e6f141c572, + 0xc5cef9d85dd605f4, + 0x938f2ac2bf885997, + 0x23bddbace7c48f6c, + 0xc90a6c5ba98537e4, + 0x0be6ee2cca90f6ae, + 0xa026175394ae0e90, + 0x29fca3e314c77628, + 0x2aa2aa8738ab7b77, + 0xe11bbd31fbb8cac6, + 0xb5bbbef1b78a23af, + ], + [ + 0x8b62a5551e9a9797, + 0x3f91073d4d491c80, + 0x4cfa44976396424a, + 0xf8dcb2dfb3aa1b44, + 0x3849409eba1a95f5, + 0x070845799f234380, + 0x184c0093667da1ba, + 0xbd66aafccd51601e, + 0xee6d14e92155b490, + 0x626f2ec1865bc544, + 0x1bd2854bf6485986, + 0x368b8497472f12ef, + ], + [ + 0x4f88cdcdfb791921, + 0xe2c0acfeda9ae781, + 0x9739bc21773469b3, + 0x00ce3ad64dc4bb8f, + 0xaab85a321ee7a4c8, + 0xd5de825be97004f4, + 0x48d676d3a043b1c6, + 0x9c6180b1ff643097, + 0x34882a89dd590b09, + 0xae7e6b0d249c3b1d, + 0x8c016908a04885a1, + 0x83ebaaebc9ae0721, + ], + [ + 0xab21b42e0f642307, + 0xdb46631f62bb29c1, + 0xef29f0399e09b5d9, + 0x5b52fbb3613b8ba1, + 0x57e129fcc96922e6, + 0xcdeb14c9d9204b3a, + 0x1341ef0da8536e34, + 0xd7e3400f2bacde63, + 0x6911eeb42f70d7e5, + 0xc3a2a910a4679767, + 0x1773cbe4a0f6bb28, + 0xe17b0d53e843eab5, + ], + [ + 0x587fa39990b62800, + 0x0d5d32788135879d, + 0x277f7b31fd3a4cdb, + 0xa435290ee56d7efa, + 0xea6f40be35159925, + 0xcb73377a506171cb, + 0xe43c367ce731d82a, + 0x6eb305031ca10c43, + 0xc019a8c622cc84cb, + 0xd5614f5658c612e6, + 0x7b1ecbe957c3ff98, + 0x60db6ee9651a8478, + ], + // Terminal full rounds (4) + [ + 0x9271d450fc9b4117, + 0xcffeea06b6e3aac1, + 0xfa4a44c748d1cd8e, + 0xe64db01ba569b469, + 0xd31005160e4045fe, + 
0x39e0fa013e025f79, + 0xe243be574196a956, + 0x205b2a681e3d2642, + 0x79cae5ad93486bab, + 0xfdf567844e32c295, + 0x331679589bfb7189, + 0xaf06ee32297b89c2, + ], + [ + 0xa6bcae311e498491, + 0x9d16f52c96ac8b3e, + 0x48a674b59393fa35, + 0x0f9e65da3fde3796, + 0x1e098310fc84578c, + 0x559ae5fab1ae8dad, + 0x56bd4d624078881d, + 0xfd8bbbf8fbe817b5, + 0x82d30695c44df534, + 0x3ec0a97bc41127c5, + 0x1eb8b64adaa22078, + 0x82c45e418d60c983, + ], + [ + 0xb092280f484d55bf, + 0xcd317c9537697939, + 0xd3be2e352feb79f3, + 0xca6d866539a390e5, + 0xb5efb1a494e55ee6, + 0xfa9013ac89756e9e, + 0xaeb88efd1e981242, + 0x13ee477cdab6e0dc, + 0xce7df902c40da2d3, + 0xf3fbaf0d4e6f5f34, + 0xf96354ada6785f38, + 0x13b5692812406886, + ], + [ + 0xf03cae030a0f4418, + 0x7d3172887aa98e1a, + 0x8a2c2644f2faf7b9, + 0x80d721abee696d00, + 0x27c8b903a4d68267, + 0xaf0b7b12f90291b8, + 0x00acd08cfdff3817, + 0x4659ee496c634328, + 0xf5b25c10730dbff1, + 0xdde3a153297329c2, + 0x50c0b70d6910a44b, + 0x23c7426af725a6a0, + ], +]); + +/// Create the default width-8 Poseidon1 permutation for Goldilocks. +/// +/// Returns the platform-optimal implementation: dual-dispatch on aarch64 +/// (generic for scalar, fused ASM for packed), generic Karatsuba on all +/// other platforms. +#[cfg(target_arch = "aarch64")] +pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> { + let constants = Poseidon1Constants { + rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + mds_circ_col: MATRIX_CIRC_MDS_8_COL, + round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(), + }; + let (full, partial) = constants.to_optimized(); + let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial); + crate::Poseidon1GoldilocksDispatch::new(fused, full, partial) +} + +/// Create the default width-8 Poseidon1 permutation for Goldilocks. +/// +/// Returns the platform-optimal implementation: fused ASM on aarch64, +/// generic Karatsuba on all other platforms. 
+#[cfg(not(target_arch = "aarch64"))] +pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> { + Poseidon1::new(&Poseidon1Constants { + rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + mds_circ_col: MATRIX_CIRC_MDS_8_COL, + round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(), + }) +} + +/// Create the default width-12 Poseidon1 permutation for Goldilocks. +/// +/// Returns the platform-optimal implementation: dual-dispatch on aarch64 +/// (generic for scalar, fused ASM for packed), generic Karatsuba on all +/// other platforms. +#[cfg(target_arch = "aarch64")] +pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> { + let constants = Poseidon1Constants { + rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, + mds_circ_col: MATRIX_CIRC_MDS_12_COL, + round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(), + }; + let (full, partial) = constants.to_optimized(); + let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial); + crate::Poseidon1GoldilocksDispatch::new(fused, full, partial) +} + +/// Create the default width-12 Poseidon1 permutation for Goldilocks. +/// +/// Returns the platform-optimal implementation: fused ASM on aarch64, +/// generic Karatsuba on all other platforms. +#[cfg(not(target_arch = "aarch64"))] +pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> { + Poseidon1::new(&Poseidon1Constants { + rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, + mds_circ_col: MATRIX_CIRC_MDS_12_COL, + round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(), + }) +} + +#[cfg(test)] +mod tests { + use p3_symmetric::Permutation; + use rand::SeedableRng; + use rand::rngs::SmallRng; + + use super::*; + + type F = Goldilocks; + + /// Known-answer test for width 8 (sequential 0..7 input). 
+ #[test] + fn test_poseidon_goldilocks_width_8() { + let perm = default_goldilocks_poseidon1_8(); + + let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]); + perm.permute_mut(&mut input); + + let expected: [F; 8] = F::new_array([ + 2431226948502761687, + 9427563026145807618, + 6827549936272051660, + 16907684411084503785, + 10131745626715172913, + 17448305483431576765, + 9066501914269485014, + 12095238468458521303, + ]); + assert_eq!(input, expected); + } + + /// Known-answer test for width 12 (sequential 0..11 input). + #[test] + fn test_poseidon_goldilocks_width_12() { + let perm = default_goldilocks_poseidon1_12(); + + let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + perm.permute_mut(&mut input); + + let expected: [F; 12] = F::new_array([ + 15595088881848875364, + 9564850329150784619, + 13607005230761744521, + 12117102595842533385, + 2814257411756993122, + 11640647689983397089, + 14363867760831937423, + 13323891071259596526, + 11219803511311150468, + 9221595262780869902, + 5898229059046891887, + 18181291031484020550, + ]); + assert_eq!(input, expected); + } + + /// Smoke test for width 16 with random constants. + /// Uses the generic type directly since the fused type only supports 8 and 12. + #[test] + fn test_poseidon_goldilocks_width_16() { + let mut rng = SmallRng::seed_from_u64(1); + let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng( + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + &MdsMatrixGoldilocks, + &mut rng, + ); + let input: [F; 16] = rand::RngExt::random(&mut rng); + let output = poseidon.permute(input); + assert_ne!(output, input); + } + + /// Smoke test for width 24 with random constants. 
+ #[test] + fn test_poseidon_goldilocks_width_24() { + let mut rng = SmallRng::seed_from_u64(1); + let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng( + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + &MdsMatrixGoldilocks, + &mut rng, + ); + let input: [F; 24] = rand::RngExt::random(&mut rng); + let output = poseidon.permute(input); + assert_ne!(output, input); + } + + #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] + mod avx512 { + use super::*; + use crate::PackedGoldilocksAVX512; + + #[test] + fn test_avx512_poseidon_width_16() { + let mut rng = SmallRng::seed_from_u64(1); + let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng( + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + &MdsMatrixGoldilocks, + &mut rng, + ); + let input: [F; 16] = rand::RngExt::random(&mut rng); + + let mut expected = input; + poseidon.permute_mut(&mut expected); + + let mut avx512_input = input.map(Into::::into); + poseidon.permute_mut(&mut avx512_input); + + let avx512_output = avx512_input.map(|x| x.0[0]); + assert_eq!(avx512_output, expected); + } + + #[test] + fn test_avx512_poseidon_width_24() { + let mut rng = SmallRng::seed_from_u64(1); + let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng( + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + &MdsMatrixGoldilocks, + &mut rng, + ); + let input: [F; 24] = rand::RngExt::random(&mut rng); + + let mut expected = input; + poseidon.permute_mut(&mut expected); + + let mut avx512_input = input.map(Into::::into); + poseidon.permute_mut(&mut avx512_input); + + let avx512_output = avx512_input.map(|x| x.0[0]); + assert_eq!(avx512_output, expected); + } + } + + #[cfg(all( + target_arch = "x86_64", + target_feature = "avx2", + not(target_feature = "avx512f") + ))] + mod avx2 { + use super::*; + use crate::PackedGoldilocksAVX2; + + #[test] + fn test_avx2_poseidon_width_16() { + let mut rng = 
SmallRng::seed_from_u64(1); + let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng( + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + &MdsMatrixGoldilocks, + &mut rng, + ); + let input: [F; 16] = rand::RngExt::random(&mut rng); + + let mut expected = input; + poseidon.permute_mut(&mut expected); + + let mut avx2_input = input.map(Into::::into); + poseidon.permute_mut(&mut avx2_input); + + let avx2_output = avx2_input.map(|x| x.0[0]); + assert_eq!(avx2_output, expected); + } + + #[test] + fn test_avx2_poseidon_width_24() { + let mut rng = SmallRng::seed_from_u64(1); + let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng( + GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, + GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, + &MdsMatrixGoldilocks, + &mut rng, + ); + let input: [F; 24] = rand::RngExt::random(&mut rng); + + let mut expected = input; + poseidon.permute_mut(&mut expected); + + let mut avx2_input = input.map(Into::::into); + poseidon.permute_mut(&mut avx2_input); + + let avx2_output = avx2_input.map(|x| x.0[0]); + assert_eq!(avx2_output, expected); + } + } + + #[cfg(target_arch = "aarch64")] + mod neon { + use super::*; + use crate::PackedGoldilocksNeon; + + #[test] + fn test_neon_poseidon_width_8() { + let perm = default_goldilocks_poseidon1_8(); + let input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]); + + let mut expected = input; + perm.permute_mut(&mut expected); + + let mut neon_input = input.map(Into::::into); + perm.permute_mut(&mut neon_input); + + let neon_output = neon_input.map(|x| x.0[0]); + assert_eq!(neon_output, expected); + } + + #[test] + fn test_neon_poseidon_width_12() { + let perm = default_goldilocks_poseidon1_12(); + let input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + + let mut expected = input; + perm.permute_mut(&mut expected); + + let mut neon_input = input.map(Into::::into); + perm.permute_mut(&mut neon_input); + + let neon_output = neon_input.map(|x| x.0[0]); + 
assert_eq!(neon_output, expected); + } + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs new file mode 100644 index 000000000..b5d158610 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs @@ -0,0 +1,980 @@ +//! Implementation of Poseidon2, see: https://eprint.iacr.org/2023/323 + +use alloc::vec::Vec; + +use p3_field::{Algebra, InjectiveMonomial, PrimeCharacteristicRing}; +#[cfg(not(target_arch = "aarch64"))] +use p3_poseidon2::Poseidon2; +use p3_poseidon2::{ + ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, GenericPoseidon2LinearLayers, + InternalLayer, InternalLayerConstructor, MDSMat4, add_rc_and_sbox_generic, + external_initial_permute_state, external_terminal_permute_state, internal_permute_state, + matmul_internal, +}; + +use crate::Goldilocks; +use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE; + +/// Number of full rounds per half for Goldilocks Poseidon2 (`RF / 2`). +/// +/// The total number of full rounds is `RF = 8` (4 beginning + 4 ending). +/// Follows the Poseidon2 paper's security analysis with a +2 RF margin. +pub const GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS: usize = 4; + +/// Number of partial rounds for Goldilocks Poseidon2 (width 8). +/// +/// Derived from the interpolation bound in the Poseidon paper (Eq. 3): +/// +/// R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5 +/// = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20 +/// +/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22. +pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8: usize = 22; + +/// Number of partial rounds for Goldilocks Poseidon2 (width 12). +/// +/// Same interpolation bound as width 8: +/// +/// R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20 +/// +/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22. +pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_12: usize = 22; + +/// An implementation of the Poseidon2 hash function for the Goldilocks field. 
+/// +/// It acts on arrays of the form `[Goldilocks; WIDTH]`. +#[cfg(target_arch = "aarch64")] +pub type Poseidon2Goldilocks = crate::Poseidon2GoldilocksFused; + +/// An implementation of the Poseidon2 hash function for the Goldilocks field. +/// +/// It acts on arrays of the form `[Goldilocks; WIDTH]`. +#[cfg(not(target_arch = "aarch64"))] +pub type Poseidon2Goldilocks = Poseidon2< + Goldilocks, + Poseidon2ExternalLayerGoldilocks, + Poseidon2InternalLayerGoldilocks, + WIDTH, + GOLDILOCKS_S_BOX_DEGREE, +>; + +/// Round constants for width-8 Poseidon2 on Goldilocks. +/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 +/// +/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`. +/// +/// Layout: external_initial (4 rounds × 8 elements). +pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL: [[Goldilocks; 8]; 4] = [ + Goldilocks::new_array([ + 0xdd5743e7f2a5a5d9, + 0xcb3a864e58ada44b, + 0xffa2449ed32f8cdc, + 0x42025f65d6bd13ee, + 0x7889175e25506323, + 0x34b98bb03d24b737, + 0xbdcc535ecc4faa2a, + 0x5b20ad869fc0d033, + ]), + Goldilocks::new_array([ + 0xf1dda5b9259dfcb4, + 0x27515210be112d59, + 0x4227d1718c766c3f, + 0x26d333161a5bd794, + 0x49b938957bf4b026, + 0x4a56b5938b213669, + 0x1120426b48c8353d, + 0x6b323c3f10a56cad, + ]), + Goldilocks::new_array([ + 0xce57d6245ddca6b2, + 0xb1fc8d402bba1eb1, + 0xb5c5096ca959bd04, + 0x6db55cd306d31f7f, + 0xc49d293a81cb9641, + 0x1ce55a4fe979719f, + 0xa92e60a9d178a4d1, + 0x002cc64973bcfd8c, + ]), + Goldilocks::new_array([ + 0xcea721cce82fb11b, + 0xe5b55eb8098ece81, + 0x4e30525c6f1ddd66, + 0x43c6702827070987, + 0xaca68430a7b5762a, + 0x3674238634df9c93, + 0x88cee1c825e33433, + 0xde99ae8d74b57176, + ]), +]; + +/// Round constants for width-8 Poseidon2 on Goldilocks. 
+/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 +/// +/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`. +/// +/// Layout: external_final (4 rounds × 8 elements). +pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL: [[Goldilocks; 8]; 4] = [ + Goldilocks::new_array([ + 0x014ef1197d341346, + 0x9725e20825d07394, + 0xfdb25aef2c5bae3b, + 0xbe5402dc598c971e, + 0x93a5711f04cdca3d, + 0xc45a9a5b2f8fb97b, + 0xfe8946a924933545, + 0x2af997a27369091c, + ]), + Goldilocks::new_array([ + 0xaa62c88e0b294011, + 0x058eb9d810ce9f74, + 0xb3cb23eced349ae4, + 0xa3648177a77b4a84, + 0x43153d905992d95d, + 0xf4e2a97cda44aa4b, + 0x5baa2702b908682f, + 0x082923bdf4f750d1, + ]), + Goldilocks::new_array([ + 0x98ae09a325893803, + 0xf8a6475077968838, + 0xceb0735bf00b2c5f, + 0x0a1a5d953888e072, + 0x2fcb190489f94475, + 0xb5be06270dec69fc, + 0x739cb934b09acf8b, + 0x537750b75ec7f25b, + ]), + Goldilocks::new_array([ + 0xe9dd318bae1f3961, + 0xf7462137299efe1a, + 0xb1f6b8eee9adb940, + 0xbdebcc8a809dfe6b, + 0x40fc1f791b178113, + 0x3ac1c3362d014864, + 0x9a016184bdb8aeba, + 0x95f2394459fbc25e, + ]), +]; + +/// Round constants for width-8 Poseidon2 on Goldilocks. +/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 +/// +/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`. +/// +/// Layout: internal (22 scalar constants). 
+pub const GOLDILOCKS_POSEIDON2_RC_8_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([ + 0x488897d85ff51f56, + 0x1140737ccb162218, + 0xa7eeb9215866ed35, + 0x9bd2976fee49fcc9, + 0xc0c8f0de580a3fcc, + 0x4fb2dae6ee8fc793, + 0x343a89f35f37395b, + 0x223b525a77ca72c8, + 0x56ccb62574aaa918, + 0xc4d507d8027af9ed, + 0xa080673cf0b7e95c, + 0xf0184884eb70dcf8, + 0x044f10b0cb3d5c69, + 0xe9e3f7993938f186, + 0x1b761c80e772f459, + 0x606cec607a1b5fac, + 0x14a0c2e1d45f03cd, + 0x4eace8855398574f, + 0xf905ca7103eff3e6, + 0xf8c8f8d20862c059, + 0xb524fe8bdd678e5a, + 0xfbb7865901a1ec41, +]); + +/// Round constants for width-12 Poseidon2 on Goldilocks. +/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 +/// +/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`. +/// +/// Layout: external_initial (4 rounds × 12 elements). +pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL: [[Goldilocks; 12]; 4] = [ + Goldilocks::new_array([ + 0x13dcf33aba214f46, + 0x30b3b654a1da6d83, + 0x1fc634ada6159b56, + 0x937459964dc03466, + 0xedd2ef2ca7949924, + 0xede9affde0e22f68, + 0x8515b9d6bac9282d, + 0x6b5c07b4e9e900d8, + 0x1ec66368838c8a08, + 0x9042367d80d1fbab, + 0x400283564a3c3799, + 0x4a00be0466bca75e, + ]), + Goldilocks::new_array([ + 0x7913beee58e3817f, + 0xf545e88532237d90, + 0x22f8cb8736042005, + 0x6f04990e247a2623, + 0xfe22e87ba37c38cd, + 0xd20e32c85ffe2815, + 0x117227674048fe73, + 0x4e9fb7ea98a6b145, + 0xe0866c232b8af08b, + 0x00bbc77916884964, + 0x7031c0fb990d7116, + 0x240a9e87cf35108f, + ]), + Goldilocks::new_array([ + 0x2e6363a5a12244b3, + 0x5e1c3787d1b5011c, + 0x4132660e2a196e8b, + 0x3a013b648d3d4327, + 0xf79839f49888ea43, + 0xfe85658ebafe1439, + 0xb6889825a14240bd, + 0x578453605541382b, + 0x4508cda8f6b63ce9, + 0x9c3ef35848684c91, + 0x0812bde23c87178c, + 0xfe49638f7f722c14, + ]), + Goldilocks::new_array([ + 0x8e3f688ce885cbf5, + 0xb8e110acf746a87d, + 0xb4b2e8973a6dabef, + 0x9e714c5da3d462ec, + 
0x6438f9033d3d0c15, + 0x24312f7cf1a27199, + 0x23f843bb47acbf71, + 0x9183f11a34be9f01, + 0x839062fbb9d45dbf, + 0x24b56e7e6c2e43fa, + 0xe1683da61c962a72, + 0xa95c63971a19bfa7, + ]), +]; + +/// Round constants for width-12 Poseidon2 on Goldilocks. +/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 +/// +/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`. +/// +/// Layout: external_final (4 rounds × 12 elements). +pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL: [[Goldilocks; 12]; 4] = [ + Goldilocks::new_array([ + 0xc68be7c94882a24d, + 0xaf996d5d5cdaedd9, + 0x9717f025e7daf6a5, + 0x6436679e6e7216f4, + 0x8a223d99047af267, + 0xbb512e35a133ba9a, + 0xfbbf44097671aa03, + 0xf04058ebf6811e61, + 0x5cca84703fac7ffb, + 0x9b55c7945de6469f, + 0x8e05bf09808e934f, + 0x2ea900de876307d7, + ]), + Goldilocks::new_array([ + 0x7748fff2b38dfb89, + 0x6b99a676dd3b5d81, + 0xac4bb7c627cf7c13, + 0xadb6ebe5e9e2f5ba, + 0x2d33378cafa24ae3, + 0x1e5b73807543f8c2, + 0x09208814bfebb10f, + 0x782e64b6bb5b93dd, + 0xadd5a48eac90b50f, + 0xadd4c54c736ea4b1, + 0xd58dbb86ed817fd8, + 0x6d5ed1a533f34ddd, + ]), + Goldilocks::new_array([ + 0x28686aa3e36b7cb9, + 0x591abd3476689f36, + 0x047d766678f13875, + 0xa2a11112625f5b49, + 0x21fd10a3f8304958, + 0xf9b40711443b0280, + 0xd2697eb8b2bde88e, + 0x3493790b51731b3f, + 0x11caf9dd73764023, + 0x7acfb8f72878164e, + 0x744ec4db23cefc26, + 0x1e00e58f422c6340, + ]), + Goldilocks::new_array([ + 0x21dd28d906a62dda, + 0xf32a46ab5f465b5f, + 0xbfce13201f3f7e6b, + 0xf30d2e7adb5304e2, + 0xecdf4ee4abad48e9, + 0xf94e82182d395019, + 0x4ee52e3744d887c5, + 0xa1341c7cac0083b2, + 0x2302fb26c30c834a, + 0xaea3c587273bf7d3, + 0xf798e24961823ec7, + 0x962deba3e9a2cd94, + ]), +]; + +/// Round constants for width-12 Poseidon2 on Goldilocks. 
+/// +/// Generated by the Grain LFSR with parameters: +/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 +/// +/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`. +/// +/// Layout: internal (22 scalar constants). +pub const GOLDILOCKS_POSEIDON2_RC_12_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([ + 0x4adf842aa75d4316, + 0xf8fbb871aa4ab4eb, + 0x68e85b6eb2dd6aeb, + 0x07a0b06b2d270380, + 0xd94e0228bd282de4, + 0x8bdd91d3250c5278, + 0x209c68b88bba778f, + 0xb5e18cdab77f3877, + 0xb296a3e808da93fa, + 0x8370ecbda11a327e, + 0x3f9075283775dad8, + 0xb78095bb23c6aa84, + 0x3f36b9fe72ad4e5f, + 0x69bc96780b10b553, + 0x3f1d341f2eb7b881, + 0x4e939e9815838818, + 0xda366b3ae2a31604, + 0xbc89db1e7287d509, + 0x6102f411f9ef5659, + 0x58725c5e7ac1f0ab, + 0x0df5856c798883e7, + 0xf7bb62a8da4c961b, +]); + +/// Create a default width-8 Poseidon2 permutation for Goldilocks. +#[cfg(not(target_arch = "aarch64"))] +pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> { + Poseidon2::new( + ExternalLayerConstants::new( + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(), + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(), + ), + GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec(), + ) +} + +/// Create a default width-8 Poseidon2 permutation for Goldilocks. +#[cfg(target_arch = "aarch64")] +pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> { + crate::Poseidon2GoldilocksFused::new( + &ExternalLayerConstants::new( + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(), + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(), + ), + &GOLDILOCKS_POSEIDON2_RC_8_INTERNAL, + ) +} + +/// Create a default width-12 Poseidon2 permutation for Goldilocks. 
+#[cfg(not(target_arch = "aarch64"))] +pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> { + Poseidon2::new( + ExternalLayerConstants::new( + GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(), + GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(), + ), + GOLDILOCKS_POSEIDON2_RC_12_INTERNAL.to_vec(), + ) +} + +/// Create a default width-12 Poseidon2 permutation for Goldilocks. +#[cfg(target_arch = "aarch64")] +pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> { + crate::Poseidon2GoldilocksFused::new( + &ExternalLayerConstants::new( + GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(), + GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(), + ), + &GOLDILOCKS_POSEIDON2_RC_12_INTERNAL, + ) +} + +pub const MATRIX_DIAG_8_GOLDILOCKS: [Goldilocks; 8] = Goldilocks::new_array([ + 0xfffffffeffffffff, // -2 + 0x0000000000000001, // 1 + 0x0000000000000002, // 2 + 0x7fffffff80000001, // 1/2 + 0x0000000000000003, // 3 + 0x7fffffff80000000, // -1/2 + 0xfffffffefffffffe, // -3 + 0xfffffffefffffffd, // -4 +]); + +pub const MATRIX_DIAG_12_GOLDILOCKS: [Goldilocks; 12] = Goldilocks::new_array([ + 0xfffffffeffffffff, // -2 + 0x0000000000000001, // 1 + 0x0000000000000002, // 2 + 0x7fffffff80000001, // 1/2 + 0x0000000000000003, // 3 + 0x0000000000000004, // 4 + 0x7fffffff80000000, // -1/2 + 0xfffffffefffffffe, // -3 + 0xfffffffefffffffd, // -4 + 0xbfffffff40000001, // 1/2^2 + 0x3fffffffc0000000, // -1/2^2 + 0xdfffffff20000001, // 1/2^3 +]); + +pub const MATRIX_DIAG_16_GOLDILOCKS: [Goldilocks; 16] = Goldilocks::new_array([ + 0xfffffffeffffffff, // -2 + 0x0000000000000001, // 1 + 0x0000000000000002, // 2 + 0x7fffffff80000001, // 1/2 + 0x0000000000000003, // 3 + 0x0000000000000004, // 4 + 0x7fffffff80000000, // -1/2 + 0xfffffffefffffffe, // -3 + 0xfffffffefffffffd, // -4 + 0xdfffffff20000001, // 1/2^3 + 0xefffffff10000001, // 1/2^4 + 0xf7ffffff08000001, // 1/2^5 + 0x1fffffffe0000000, // -1/2^3 + 0x0ffffffff0000000, // -1/2^4 + 0x07fffffff8000000, // 
-1/2^5 + 0xfffffffe00000002, // 1/2^32 +]); + +pub const MATRIX_DIAG_20_GOLDILOCKS: [Goldilocks; 20] = Goldilocks::new_array([ + 0x95c381fda3b1fa57, + 0xf36fe9eb1288f42c, + 0x89f5dcdfef277944, + 0x106f22eadeb3e2d2, + 0x684e31a2530e5111, + 0x27435c5d89fd148e, + 0x3ebed31c414dbf17, + 0xfd45b0b2d294e3cc, + 0x48c904473a7f6dbf, + 0xe0d1b67809295b4d, + 0xddd1941e9d199dcb, + 0x8cfe534eeb742219, + 0xa6e5261d9e3b8524, + 0x6897ee5ed0f82c1b, + 0x0e7dcd0739ee5f78, + 0x493253f3d0d32363, + 0xbb2737f5845f05c0, + 0xa187e810b06ad903, + 0xb635b995936c4918, + 0x0b3694a940bd2394, +]); + +fn internal_layer_mat_mul_goldilocks_8>(state: &mut [A; 8]) { + let sum: A = state.iter().map(|r| r.dup()).sum(); + + let s0 = state[0].dup(); + let s1 = state[1].dup(); + let s2 = state[2].dup(); + let s3 = state[3].dup(); + let s4 = state[4].dup(); + let s5 = state[5].dup(); + let s6 = state[6].dup(); + let s7 = state[7].dup(); + + // V[0] = -2 + let two_s0 = s0.dup() + s0; + state[0] = sum.dup() - two_s0; + + // V[1] = 1 + state[1] = sum.dup() + s1; + + // V[2] = 2 + let two_s2 = s2.dup() + s2; + state[2] = sum.dup() + two_s2; + + // V[3] = 1/2 + state[3] = sum.dup() + s3.halve(); + + // V[4] = 3 + let two_s4 = s4.dup() + s4.dup(); + let three_s4 = two_s4 + s4; + state[4] = sum.dup() + three_s4; + + // V[5] = -1/2 + state[5] = sum.dup() - s5.halve(); + + // V[6] = -3 + let two_s6 = s6.dup() + s6.dup(); + let three_s6 = two_s6 + s6; + state[6] = sum.dup() - three_s6; + + // V[7] = -4 + let two_s7 = s7.dup() + s7; + let four_s7 = two_s7.dup() + two_s7; + state[7] = sum - four_s7; +} + +fn internal_layer_mat_mul_goldilocks_12>(state: &mut [A; 12]) { + let sum: A = state.iter().map(|r| r.dup()).sum(); + + let s0 = state[0].dup(); + let s1 = state[1].dup(); + let s2 = state[2].dup(); + let s3 = state[3].dup(); + let s4 = state[4].dup(); + let s5 = state[5].dup(); + let s6 = state[6].dup(); + let s7 = state[7].dup(); + let s8 = state[8].dup(); + let s9 = state[9].dup(); + let s10 = state[10].dup(); + let 
s11 = state[11].dup(); + + // V[0] = -2 + let two_s0 = s0.dup() + s0; + state[0] = sum.dup() - two_s0; + + // V[1] = 1 + state[1] = sum.dup() + s1; + + // V[2] = 2 + let two_s2 = s2.dup() + s2; + state[2] = sum.dup() + two_s2; + + // V[3] = 1/2 + state[3] = sum.dup() + s3.halve(); + + // V[4] = 3 + let two_s4 = s4.dup() + s4.dup(); + let three_s4 = two_s4 + s4; + state[4] = sum.dup() + three_s4; + + // V[5] = 4 + let two_s5 = s5.dup() + s5; + let four_s5 = two_s5.dup() + two_s5; + state[5] = sum.dup() + four_s5; + + // V[6] = -1/2 + state[6] = sum.dup() - s6.halve(); + + // V[7] = -3 + let two_s7 = s7.dup() + s7.dup(); + let three_s7 = two_s7 + s7; + state[7] = sum.dup() - three_s7; + + // V[8] = -4 + let two_s8 = s8.dup() + s8; + let four_s8 = two_s8.dup() + two_s8; + state[8] = sum.dup() - four_s8; + + // V[9] = 1/2^2 + state[9] = sum.dup() + s9.halve().halve(); + + // V[10] = -1/2^2 + state[10] = sum.dup() - s10.halve().halve(); + + // V[11] = 1/2^3 + state[11] = sum + s11.halve().halve().halve(); +} + +fn internal_layer_mat_mul_goldilocks_16>(state: &mut [A; 16]) { + let sum: A = state.iter().map(|r| r.dup()).sum(); + + let s0 = state[0].dup(); + let s1 = state[1].dup(); + let s2 = state[2].dup(); + let s3 = state[3].dup(); + let s4 = state[4].dup(); + let s5 = state[5].dup(); + let s6 = state[6].dup(); + let s7 = state[7].dup(); + let s8 = state[8].dup(); + let s9 = state[9].dup(); + let s10 = state[10].dup(); + let s11 = state[11].dup(); + let s12 = state[12].dup(); + let s13 = state[13].dup(); + let s14 = state[14].dup(); + let s15 = state[15].dup(); + + // V[0] = -2 + let two_s0 = s0.dup() + s0; + state[0] = sum.dup() - two_s0; + + // V[1] = 1 + state[1] = sum.dup() + s1; + + // V[2] = 2 + let two_s2 = s2.dup() + s2; + state[2] = sum.dup() + two_s2; + + // V[3] = 1/2 + state[3] = sum.dup() + s3.halve(); + + // V[4] = 3 + let two_s4 = s4.dup() + s4.dup(); + let three_s4 = two_s4 + s4; + state[4] = sum.dup() + three_s4; + + // V[5] = 4 + let two_s5 = s5.dup() 
+ s5; + let four_s5 = two_s5.dup() + two_s5; + state[5] = sum.dup() + four_s5; + + // V[6] = -1/2 + state[6] = sum.dup() - s6.halve(); + + // V[7] = -3 + let two_s7 = s7.dup() + s7.dup(); + let three_s7 = two_s7 + s7; + state[7] = sum.dup() - three_s7; + + // V[8] = -4 + let two_s8 = s8.dup() + s8; + let four_s8 = two_s8.dup() + two_s8; + state[8] = sum.dup() - four_s8; + + // V[9] = 1/2^3 + state[9] = sum.dup() + s9.halve().halve().halve(); + + // V[10] = 1/2^4 + state[10] = sum.dup() + s10.halve().halve().halve().halve(); + + // V[11] = 1/2^5 + state[11] = sum.dup() + s11.halve().halve().halve().halve().halve(); + + // V[12] = -1/2^3 + state[12] = sum.dup() - s12.halve().halve().halve(); + + // V[13] = -1/2^4 + state[13] = sum.dup() - s13.halve().halve().halve().halve(); + + // V[14] = -1/2^5 + state[14] = sum.dup() - s14.halve().halve().halve().halve().halve(); + + // V[15] = 1/2^32 + let inv_2_32 = MATRIX_DIAG_16_GOLDILOCKS[15]; + let v15 = s15 * inv_2_32; + state[15] = sum + v15; +} + +/// The internal layers of the Poseidon2 permutation. +#[derive(Debug, Clone, Default)] +pub struct Poseidon2InternalLayerGoldilocks { + internal_constants: Vec, +} + +impl InternalLayerConstructor for Poseidon2InternalLayerGoldilocks { + fn new_from_constants(internal_constants: Vec) -> Self { + Self { internal_constants } + } +} + +impl + InjectiveMonomial> + InternalLayer for Poseidon2InternalLayerGoldilocks +{ + /// Perform the internal layers of the Poseidon2 permutation on the given state. + fn permute_state(&self, state: &mut [A; 8]) { + internal_permute_state( + state, + internal_layer_mat_mul_goldilocks_8, + &self.internal_constants, + ); + } +} + +impl + InjectiveMonomial> + InternalLayer for Poseidon2InternalLayerGoldilocks +{ + /// Perform the internal layers of the Poseidon2 permutation on the given state. 
+ fn permute_state(&self, state: &mut [A; 12]) { + internal_permute_state( + state, + internal_layer_mat_mul_goldilocks_12, + &self.internal_constants, + ); + } +} + +impl + InjectiveMonomial> + InternalLayer for Poseidon2InternalLayerGoldilocks +{ + /// Perform the internal layers of the Poseidon2 permutation on the given state. + fn permute_state(&self, state: &mut [A; 16]) { + internal_permute_state( + state, + internal_layer_mat_mul_goldilocks_16, + &self.internal_constants, + ); + } +} + +impl + InjectiveMonomial> + InternalLayer for Poseidon2InternalLayerGoldilocks +{ + /// Perform the internal layers of the Poseidon2 permutation on the given state. + fn permute_state(&self, state: &mut [A; 20]) { + internal_permute_state( + state, + |x| matmul_internal(x, MATRIX_DIAG_20_GOLDILOCKS), + &self.internal_constants, + ); + } +} + +/// The external layers of the Poseidon2 permutation. +#[derive(Clone)] +pub struct Poseidon2ExternalLayerGoldilocks { + pub(crate) external_constants: ExternalLayerConstants, +} + +impl ExternalLayerConstructor + for Poseidon2ExternalLayerGoldilocks +{ + fn new_from_constants(external_constants: ExternalLayerConstants) -> Self { + Self { external_constants } + } +} + +impl + InjectiveMonomial, const WIDTH: usize> + ExternalLayer for Poseidon2ExternalLayerGoldilocks +{ + /// Perform the initial external layers of the Poseidon2 permutation on the given state. + fn permute_state_initial(&self, state: &mut [A; WIDTH]) { + external_initial_permute_state( + state, + self.external_constants.get_initial_constants(), + add_rc_and_sbox_generic, + &MDSMat4, + ); + } + + /// Perform the terminal external layers of the Poseidon2 permutation on the given state. 
+ fn permute_state_terminal(&self, state: &mut [A; WIDTH]) { + external_terminal_permute_state( + state, + self.external_constants.get_terminal_constants(), + add_rc_and_sbox_generic, + &MDSMat4, + ); + } +} + +/// An implementation of the matrix multiplications in the internal and external layers of Poseidon2. +/// +/// This can act on `[A; WIDTH]` for any ring implementing `Algebra`. +/// If you have either `[Goldilocks::Packing; WIDTH]` or `[Goldilocks; WIDTH]` it will be much faster +/// to use `Poseidon2Goldilocks` instead of building a Poseidon2 permutation using this. +#[derive(Clone, Debug, Default)] +pub struct GenericPoseidon2LinearLayersGoldilocks; + +impl GenericPoseidon2LinearLayers<8> for GenericPoseidon2LinearLayersGoldilocks { + fn internal_linear_layer(state: &mut [R; 8]) { + let sum: R = state.iter().map(|r| r.dup()).sum(); + for i in 0..8 { + let d = R::from_u64(MATRIX_DIAG_8_GOLDILOCKS[i].value); + state[i] *= d; + state[i] += sum.dup(); + } + } +} + +impl GenericPoseidon2LinearLayers<12> for GenericPoseidon2LinearLayersGoldilocks { + fn internal_linear_layer(state: &mut [R; 12]) { + let sum: R = state.iter().map(|r| r.dup()).sum(); + for i in 0..12 { + let d = R::from_u64(MATRIX_DIAG_12_GOLDILOCKS[i].value); + state[i] *= d; + state[i] += sum.dup(); + } + } +} + +impl GenericPoseidon2LinearLayers<16> for GenericPoseidon2LinearLayersGoldilocks { + fn internal_linear_layer(state: &mut [R; 16]) { + let sum: R = state.iter().map(|r| r.dup()).sum(); + for i in 0..16 { + let d = R::from_u64(MATRIX_DIAG_16_GOLDILOCKS[i].value); + state[i] *= d; + state[i] += sum.dup(); + } + } +} + +impl GenericPoseidon2LinearLayers<20> for GenericPoseidon2LinearLayersGoldilocks { + fn internal_linear_layer(state: &mut [R; 20]) { + let sum: R = state.iter().map(|r| r.dup()).sum(); + for i in 0..20 { + let d = R::from_u64(MATRIX_DIAG_20_GOLDILOCKS[i].value); + state[i] *= d; + state[i] += sum.dup(); + } + } +} + +#[cfg(test)] +mod tests { + use 
p3_field::PrimeCharacteristicRing; + use p3_symmetric::Permutation; + + use super::*; + + type F = Goldilocks; + + #[test] + fn test_generic_internal_linear_layer_8_matches_matmul_internal() { + let mut state_generic = [ + F::from_u64(1), + F::from_u64(2), + F::from_u64(3), + F::from_u64(4), + F::from_u64(5), + F::from_u64(6), + F::from_u64(7), + F::from_u64(8), + ]; + let mut state_existing = state_generic; + + GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); + matmul_internal(&mut state_existing, MATRIX_DIAG_8_GOLDILOCKS); + + assert_eq!(state_generic, state_existing); + } + + #[test] + fn test_generic_internal_linear_layer_12_matches_matmul_internal() { + let mut state_generic = [ + F::from_u64(1), + F::from_u64(2), + F::from_u64(3), + F::from_u64(4), + F::from_u64(5), + F::from_u64(6), + F::from_u64(7), + F::from_u64(8), + F::from_u64(9), + F::from_u64(10), + F::from_u64(11), + F::from_u64(12), + ]; + let mut state_existing = state_generic; + + GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); + matmul_internal(&mut state_existing, MATRIX_DIAG_12_GOLDILOCKS); + + assert_eq!(state_generic, state_existing); + } + + #[test] + fn test_generic_internal_linear_layer_16_matches_matmul_internal() { + let mut state_generic = [ + F::from_u64(1), + F::from_u64(2), + F::from_u64(3), + F::from_u64(4), + F::from_u64(5), + F::from_u64(6), + F::from_u64(7), + F::from_u64(8), + F::from_u64(9), + F::from_u64(10), + F::from_u64(11), + F::from_u64(12), + F::from_u64(13), + F::from_u64(14), + F::from_u64(15), + F::from_u64(16), + ]; + let mut state_existing = state_generic; + + GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); + matmul_internal(&mut state_existing, MATRIX_DIAG_16_GOLDILOCKS); + + assert_eq!(state_generic, state_existing); + } + + #[test] + fn test_generic_internal_linear_layer_20_matches_matmul_internal() { + let mut state_generic = [ + F::from_u64(1), + 
F::from_u64(2), + F::from_u64(3), + F::from_u64(4), + F::from_u64(5), + F::from_u64(6), + F::from_u64(7), + F::from_u64(8), + F::from_u64(9), + F::from_u64(10), + F::from_u64(11), + F::from_u64(12), + F::from_u64(13), + F::from_u64(14), + F::from_u64(15), + F::from_u64(16), + F::from_u64(17), + F::from_u64(18), + F::from_u64(19), + F::from_u64(20), + ]; + let mut state_existing = state_generic; + + GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); + matmul_internal(&mut state_existing, MATRIX_DIAG_20_GOLDILOCKS); + + assert_eq!(state_generic, state_existing); + } + + #[test] + fn test_default_goldilocks_poseidon2_width_8() { + let mut input: [F; 8] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7]); + + let expected: [F; 8] = Goldilocks::new_array([ + 0x020cf04a1b214d14, + 0x84e14aaaeacaed25, + 0x1ae0f640e81c7457, + 0xa4d204cbaeb0d8a5, + 0x0cf637b627b3a7ff, + 0x788d304d948b486b, + 0x7327133ea1949af4, + 0xf415abb924da395b, + ]); + + let perm = default_goldilocks_poseidon2_8(); + perm.permute_mut(&mut input); + + assert_eq!(input, expected); + } + + #[test] + fn test_default_goldilocks_poseidon2_width_12() { + let mut input: [F; 12] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); + + let expected: [F; 12] = Goldilocks::new_array([ + 0xf292ab67c0f14b03, + 0x0a32f1b37656544c, + 0x053c61ab895498de, + 0x02ff92e55b196ffb, + 0x58176e8f6f58cab2, + 0xb0aa1206e7aec0f8, + 0xe90c13f3dce83ca4, + 0xf4da15333edf39c2, + 0x23b701c053c2ca6c, + 0xd233d593dcdfbf58, + 0x4effa5f9516fb52e, + 0x0aaf4489f1f40166, + ]); + + let perm = default_goldilocks_poseidon2_12(); + perm.permute_mut(&mut input); + + assert_eq!(input, expected); + } +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs new file mode 100644 index 000000000..44fe4fa3f --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs @@ -0,0 +1,86 @@ +use 
p3_mds::MdsPermutation; +use p3_mds::util::apply_circulant; +use p3_symmetric::Permutation; + +use crate::x86_64_avx2::packing::PackedGoldilocksAVX2; +use crate::{ + MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW, + MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks, +}; +const fn convert_array(arr: [i64; N]) -> [u64; N] { + let mut result: [u64; N] = [0; N]; + let mut i = 0; + while i < N { + result[i] = arr[i] as u64; + i += 1; + } + result +} + +impl Permutation<[PackedGoldilocksAVX2; 8]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX2; 8]) -> [PackedGoldilocksAVX2; 8] { + const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW); + apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksAVX2; 12]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX2; 12]) -> [PackedGoldilocksAVX2; 12] { + const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW); + apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksAVX2; 16]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX2; 16]) -> [PackedGoldilocksAVX2; 16] { + const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW); + apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksAVX2; 24]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX2; 24]) -> [PackedGoldilocksAVX2; 24] { + apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[cfg(test)] +mod tests { + use p3_symmetric::Permutation; + use rand::rngs::SmallRng; + use rand::{RngExt, SeedableRng}; + + 
use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX2}; + + macro_rules! test_avx2_mds { + ($name:ident, $width:literal) => { + #[test] + fn $name() { + let mut rng = SmallRng::seed_from_u64(1); + let mds = MdsMatrixGoldilocks; + + let input: [Goldilocks; $width] = rng.random(); + let expected = mds.permute(input); + + let packed_input = input.map(Into::::into); + let packed_output = mds.permute(packed_input); + + let avx2_output = packed_output.map(|x| x.0[0]); + assert_eq!(avx2_output, expected); + } + }; + } + + test_avx2_mds!(test_avx2_mds_width_8, 8); + test_avx2_mds!(test_avx2_mds_width_12, 12); + test_avx2_mds!(test_avx2_mds_width_16, 16); + test_avx2_mds!(test_avx2_mds_width_24, 24); +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs new file mode 100644 index 000000000..09300a20f --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs @@ -0,0 +1,3 @@ +mod mds; +mod packing; +pub use packing::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs new file mode 100644 index 000000000..217a2b2e0 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs @@ -0,0 +1,539 @@ +use alloc::vec::Vec; +use core::arch::x86_64::*; +use core::fmt::Debug; +use core::iter::{Product, Sum}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + +use p3_field::exponentiation::exp_10540996611094048183; +use p3_field::interleave::{interleave_u64, interleave_u128}; +use p3_field::op_assign_macros::{ + impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, + impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, + ring_sum, +}; +use p3_field::{ + Algebra, Field, InjectiveMonomial, PackedField, 
PackedFieldPow2, PackedValue, + PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2, +}; +use p3_util::reconstitute_from_base; +use rand::distr::{Distribution, StandardUniform}; +use rand::{Rng, RngExt}; + +use crate::{Goldilocks, P}; + +const WIDTH: usize = 4; + +/// Vectorized AVX2 implementation of `Goldilocks` arithmetic. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +#[repr(transparent)] // Needed to make `transmute`s safe. +#[must_use] +pub struct PackedGoldilocksAVX2(pub [Goldilocks; WIDTH]); + +impl PackedGoldilocksAVX2 { + /// Get an arch-specific vector representing the packed values. + #[inline] + #[must_use] + pub(crate) fn to_vector(self) -> __m256i { + unsafe { + // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It + // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be + // transmuted to `__m256i`, since arrays are guaranteed to be contiguous in memory. + // Finally `PackedGoldilocksAVX2` is `repr(transparent)` so it can be transmuted to + // `[Goldilocks; WIDTH]`. + transmute(self) + } + } + + /// Make a packed field vector from an arch-specific vector. + /// + /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function + /// is safe unlike the `Mersenne31/MontyField31` variants. + #[inline] + pub(crate) fn from_vector(vector: __m256i) -> Self { + unsafe { + // Safety: `__m256i` can be transmuted to `[u64; WIDTH]` (since arrays elements are + // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since + // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to + // `PackedGoldilocksAVX2` (since `PackedGoldilocksAVX2` is also `repr(transparent)`). + transmute(vector) + } + } + + /// Copy `value` to all positions in a packed vector. This is the same as + /// `From::from`, but `const`. 
+ #[inline] + const fn broadcast(value: Goldilocks) -> Self { + Self([value; WIDTH]) + } +} + +impl From for PackedGoldilocksAVX2 { + fn from(x: Goldilocks) -> Self { + Self::broadcast(x) + } +} + +impl Add for PackedGoldilocksAVX2 { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::from_vector(add(self.to_vector(), rhs.to_vector())) + } +} + +impl Sub for PackedGoldilocksAVX2 { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::from_vector(sub(self.to_vector(), rhs.to_vector())) + } +} + +impl Neg for PackedGoldilocksAVX2 { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::from_vector(neg(self.to_vector())) + } +} + +impl Mul for PackedGoldilocksAVX2 { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::from_vector(mul(self.to_vector(), rhs.to_vector())) + } +} + +impl_add_assign!(PackedGoldilocksAVX2); +impl_sub_assign!(PackedGoldilocksAVX2); +impl_mul_methods!(PackedGoldilocksAVX2); +ring_sum!(PackedGoldilocksAVX2); +impl_rng!(PackedGoldilocksAVX2); + +impl PrimeCharacteristicRing for PackedGoldilocksAVX2 { + type PrimeSubfield = Goldilocks; + + const ZERO: Self = Self::broadcast(Goldilocks::ZERO); + const ONE: Self = Self::broadcast(Goldilocks::ONE); + const TWO: Self = Self::broadcast(Goldilocks::TWO); + const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE); + + #[inline] + fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { + f.into() + } + + #[inline] + fn halve(&self) -> Self { + Self::from_vector(halve(self.to_vector())) + } + + #[inline] + fn square(&self) -> Self { + Self::from_vector(square(self.to_vector())) + } + + #[inline] + fn zero_vec(len: usize) -> Vec { + // SAFETY: this is a repr(transparent) wrapper around an array. + unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) } + } +} + +// Degree of the smallest permutation polynomial for Goldilocks. +// +// As p - 1 = 2^32 * 3 * 5 * 17 * ... 
the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7. +impl InjectiveMonomial<7> for PackedGoldilocksAVX2 {} + +impl PermutationMonomial<7> for PackedGoldilocksAVX2 { + /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}. + /// + /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`. + fn injective_exp_root_n(&self) -> Self { + exp_10540996611094048183(*self) + } +} + +impl_add_base_field!(PackedGoldilocksAVX2, Goldilocks); +impl_sub_base_field!(PackedGoldilocksAVX2, Goldilocks); +impl_mul_base_field!(PackedGoldilocksAVX2, Goldilocks); +impl_div_methods!(PackedGoldilocksAVX2, Goldilocks); +impl_sum_prod_base_field!(PackedGoldilocksAVX2, Goldilocks); + +impl Algebra for PackedGoldilocksAVX2 { + // Benchmarked on AVX2: chunk=32 ≈ 226ns, chunk=2 ≈ 228ns, chunk=16 ≈ 229ns. + const BATCHED_LC_CHUNK: usize = 32; +} + +impl_packed_value!(PackedGoldilocksAVX2, Goldilocks, WIDTH); + +unsafe impl PackedField for PackedGoldilocksAVX2 { + type Scalar = Goldilocks; +} + +impl_packed_field_pow_2!( + PackedGoldilocksAVX2; + [ + (1, interleave_u64), + (2, interleave_u128), + ], + WIDTH +); + +// Resources: +// 1. Intel Intrinsics Guide for explanation of each intrinsic: +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/ +// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html +// 3. Intel optimization manual for introduction to x86 vector extensions and best practices: +// https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html + +// Preliminary knowledge: +// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with +// _mm256_blendv_epi8 or similar instruction. If all we're doing is conditionally zeroing a +// vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper. +// +// 2. 
AVX does not support addition with carry but 128-bit (2-word) addition can be easily +// emulated. The method recognizes that for a + b overflowed iff (a + b) < a: +// i. res_lo = a_lo + b_lo +// ii. carry_mask = res_lo < a_lo +// iii. res_hi = a_hi + b_hi - carry_mask +// Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions +// return -1 (all bits 1) for true and 0 for false. +// +// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons +// by recognizing that a __m256i { + unsafe { _mm256_xor_si256(x, SIGN_BIT) } +} + +/// Convert to canonical representation. +/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + 1<<63, where x is the field +/// value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1<<63), +/// where 0 <= y < FIELD_ORDER). +#[inline] +unsafe fn canonicalize_s(x_s: __m256i) -> __m256i { + unsafe { + // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1. + let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s); + // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0. + let wrapback_amt = _mm256_andnot_si256(mask, EPSILON); + _mm256_add_epi64(x_s, wrapback_amt) + } +} + +/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is +/// pre-shifted by 1 << 63. The result is similarly shifted. +#[inline] +unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i { + unsafe { + let res_wrapped_s = _mm256_add_epi64(x, y_s); + let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. + _mm256_add_epi64(res_wrapped_s, wrapback_amt) + } +} + +/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER`. +/// +/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. 
+#[inline] +fn add(x: __m256i, y: __m256i) -> __m256i { + unsafe { + let y_s = shift(y); + let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); + shift(res_s) + } +} + +/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER`. +/// +/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn sub(x: __m256i, y: __m256i) -> __m256i { + unsafe { + let mut y_s = shift(y); + y_s = canonicalize_s(y_s); + let x_s = shift(x); + let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0. + let res_wrapped = _mm256_sub_epi64(x_s, y_s); + _mm256_sub_epi64(res_wrapped, wrapback_amt) + } +} + +/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER`. +/// +/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn neg(y: __m256i) -> __m256i { + unsafe { + let y_s = shift(y); + _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) + } +} + +/// Halve a vector of Goldilocks field elements. +#[inline(always)] +pub(crate) fn halve(input: __m256i) -> __m256i { + /* + We want this to compile to: + vpand least_bit, val, ONE + vpsrlq t, val, 1 + vpsubq neg_least_bit, ZERO, least_bit + vpand maybe_half, HALF, neg_least_bit + vpaddq res, t, maybe_half + throughput: 1.67 cyc/vec + latency: 4 cyc + + Given an element val in [0, P), we want to compute val/2 mod P. + If val is even: val/2 mod P = val/2 = val >> 1. + If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2 + */ + unsafe { + // Safety: If this code got compiled then AVX2 intrinsics are available. + const ONE: __m256i = unsafe { transmute([1_i64; 4]) }; + const ZERO: __m256i = unsafe { transmute([0_i64; 4]) }; + let half = _mm256_set1_epi64x(P.div_ceil(2) as i64); // Compiler should realise this is constant. + + let least_bit = _mm256_and_si256(input, ONE); // Determine the parity of val. 
+ let t = _mm256_srli_epi64::<1>(input); + + // Negate the least bit giving us either 0 (all bits 0) or -1 (all bits 1). + // It would be better to use vpsignq but this instruction does not exist. + let neg_least_bit = _mm256_sub_epi64(ZERO, least_bit); + + // This does nothing when least_bit = 1 and sets the corresponding entry to 0 when least_bit = 0 + let maybe_half = _mm256_and_si256(half, neg_least_bit); + _mm256_add_epi64(t, maybe_half) + } +} + +/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the +/// scalar instruction, but may be worth it if we want our data to live in vector registers. +#[inline] +fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + unsafe { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. + let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); + let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm256_mul_epu32(x, y); + let mul_lh = _mm256_mul_epu32(x, y_hi); + let mul_hl = _mm256_mul_epu32(x_hi, y); + let mul_hh = _mm256_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll); + let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. 
+ // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm256_and_si256(t0, EPSILON); + let t0_hi = _mm256_srli_epi64::<32>(t0); + let t1 = _mm256_add_epi64(mul_lh, t0_lo); + let t2 = _mm256_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm256_srli_epi64::<32>(t1); + let res_hi = _mm256_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1))); + let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo); + + (res_hi, res_lo) + } +} + +/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction. +#[inline] +fn square64(x: __m256i) -> (__m256i, __m256i) { + unsafe { + // Get high 32 bits of x. See comment in mul64_64_s. + let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm256_mul_epu32(x, x); + let mul_lh = _mm256_mul_epu32(x, x_hi); + let mul_hh = _mm256_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). + let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll); + let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm256_srli_epi64::<31>(t0); + let res_hi = _mm256_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). + let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh); + let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) + } +} + +/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be +/// `<= 2^64 - 2^32 = 0xffffffff00000000`. The result is shifted by 2**63. +#[inline] +unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { + unsafe { + let res_wrapped_s = _mm256_add_epi64(x_s, y); + // 32-bit compare is faster than 64-bit. 
It's safe as long as x > res_wrapped iff x >> 32 > + // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case + // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 = + // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y + // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur. + let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0. + // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. + _mm256_add_epi64(res_wrapped_s, wrapback_amt) + } +} + +/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be +/// <= `0xffffffff00000000`. The result is shifted by 2**63. +#[inline] +unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { + unsafe { + let res_wrapped_s = _mm256_sub_epi64(x_s, y); + // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >> + // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where + // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 = + // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if + // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur. + let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0. + // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0. + _mm256_sub_epi64(res_wrapped_s, wrapback_amt) + } +} + +/// Given a 128-bit value represented as two 64-bit halves, reduce it modulo the Goldilocks field order. 
+/// +/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`. +#[inline] +fn reduce128(x: (__m256i, __m256i)) -> __m256i { + unsafe { + let (hi0, lo0) = x; + + // First we shift lo0 to lo0_s = lo0 + 2^{63} mod 2^64 + // This lets us emulate unsigned comparisons + let lo0_s = shift(lo0); + + // Get the top 32 bits of hi_hi0. + let hi_hi0 = _mm256_srli_epi64::<32>(hi0); + + // Computes lo0_s - hi_hi0 mod FIELD_ORDER. + // Makes sense to do as 2^96 = -1 mod FIELD_ORDER. + // sub_small_64s_64_s is safe to use as `hi_hi0 < 2^32`. + let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0); + + // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER + // _mm256_mul_epu32 ignores the top 32 bits so just use that. + let t1 = _mm256_mul_epu32(hi0, EPSILON); + + // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 so we can use `add_small_64s_64_s` to get + // `lo2_s = lo1_s + t1 mod FIELD_ORDER.` + let lo2_s = add_small_64s_64_s(lo1_s, t1); + + // Finally just need to correct for the shift. + shift(lo2_s) + } +} + +/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER`. +/// +/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn mul(x: __m256i, y: __m256i) -> __m256i { + reduce128(mul64_64(x, y)) +} + +/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER`. +/// +/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. 
+#[inline] +fn square(x: __m256i) -> __m256i { + reduce128(square64(x)) +} + +#[cfg(test)] +mod tests { + use p3_field_testing::test_packed_field; + + use super::{Goldilocks, PackedGoldilocksAVX2, WIDTH}; + + const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([ + 0xFFFF_FFFF_0000_0000, + 0xFFFF_FFFF_FFFF_FFFF, + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0001, + ]); + + const ZEROS: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([ + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, + ])); + + const ONES: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([ + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, + ])); + + test_packed_field!( + crate::PackedGoldilocksAVX2, + &[super::ZEROS], + &[super::ONES], + crate::PackedGoldilocksAVX2(super::SPECIAL_VALS) + ); +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs new file mode 100644 index 000000000..f4d6c9f71 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs @@ -0,0 +1,86 @@ +use p3_mds::MdsPermutation; +use p3_mds::util::apply_circulant; +use p3_symmetric::Permutation; + +use crate::x86_64_avx512::packing::PackedGoldilocksAVX512; +use crate::{ + MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW, + MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks, +}; +const fn convert_array(arr: [i64; N]) -> [u64; N] { + let mut result: [u64; N] = [0; N]; + let mut i = 0; + while i < N { + result[i] = arr[i] as u64; + i += 1; + } + result +} + +impl Permutation<[PackedGoldilocksAVX512; 8]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX512; 8]) -> [PackedGoldilocksAVX512; 8] { + const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW); + 
apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksAVX512; 12]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX512; 12]) -> [PackedGoldilocksAVX512; 12] { + const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW); + apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksAVX512; 16]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX512; 16]) -> [PackedGoldilocksAVX512; 16] { + const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW); + apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +impl Permutation<[PackedGoldilocksAVX512; 24]> for MdsMatrixGoldilocks { + fn permute(&self, input: [PackedGoldilocksAVX512; 24]) -> [PackedGoldilocksAVX512; 24] { + apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) + } +} + +impl MdsPermutation for MdsMatrixGoldilocks {} + +#[cfg(test)] +mod tests { + use p3_symmetric::Permutation; + use rand::rngs::SmallRng; + use rand::{RngExt, SeedableRng}; + + use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX512}; + + macro_rules! 
test_avx512_mds { + ($name:ident, $width:literal) => { + #[test] + fn $name() { + let mut rng = SmallRng::seed_from_u64(1); + let mds = MdsMatrixGoldilocks; + + let input: [Goldilocks; $width] = rng.random(); + let expected = mds.permute(input); + + let packed_input = input.map(Into::::into); + let packed_output = mds.permute(packed_input); + + let avx512_output = packed_output.map(|x| x.0[0]); + assert_eq!(avx512_output, expected); + } + }; + } + + test_avx512_mds!(test_avx512_mds_width_8, 8); + test_avx512_mds!(test_avx512_mds_width_12, 12); + test_avx512_mds!(test_avx512_mds_width_16, 16); + test_avx512_mds!(test_avx512_mds_width_24, 24); +} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs new file mode 100644 index 000000000..09300a20f --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs @@ -0,0 +1,3 @@ +mod mds; +mod packing; +pub use packing::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs new file mode 100644 index 000000000..0c751b436 --- /dev/null +++ b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs @@ -0,0 +1,444 @@ +use alloc::vec::Vec; +use core::arch::x86_64::*; +use core::fmt::Debug; +use core::iter::{Product, Sum}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + +use p3_field::exponentiation::exp_10540996611094048183; +use p3_field::interleave::{interleave_u64, interleave_u128, interleave_u256}; +use p3_field::op_assign_macros::{ + impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, + impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, + ring_sum, +}; +use p3_field::{ + Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, + 
PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2, +}; +use p3_util::reconstitute_from_base; +use rand::distr::{Distribution, StandardUniform}; +use rand::{Rng, RngExt}; + +use crate::{Goldilocks, P}; + +const WIDTH: usize = 8; + +/// Vectorized AVX512 implementation of `Goldilocks` arithmetic. +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +#[repr(transparent)] // Needed to make `transmute`s safe. +#[must_use] +pub struct PackedGoldilocksAVX512(pub [Goldilocks; WIDTH]); + +impl PackedGoldilocksAVX512 { + /// Get an arch-specific vector representing the packed values. + #[inline] + #[must_use] + pub(crate) fn to_vector(self) -> __m512i { + unsafe { + // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It + // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be + // transmuted to `__m512i`, since arrays are guaranteed to be contiguous in memory. + // Finally `PackedGoldilocksAVX512` is `repr(transparent)` so it can be transmuted to + // `[Goldilocks; WIDTH]`. + transmute(self) + } + } + + /// Make a packed field vector from an arch-specific vector. + /// + /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function + /// is safe unlike the `Mersenne31/MontyField31` variants. + #[inline] + pub(crate) fn from_vector(vector: __m512i) -> Self { + unsafe { + // Safety: `__m512i` can be transmuted to `[u64; WIDTH]` (since arrays elements are + // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since + // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to + // `PackedGoldilocksAVX512` (since `PackedGoldilocksAVX512` is also `repr(transparent)`). + transmute(vector) + } + } + + /// Copy `value` to all positions in a packed vector. This is the same as + /// `From::from`, but `const`. 
+ #[inline] + const fn broadcast(value: Goldilocks) -> Self { + Self([value; WIDTH]) + } +} + +impl From for PackedGoldilocksAVX512 { + fn from(x: Goldilocks) -> Self { + Self::broadcast(x) + } +} + +impl Add for PackedGoldilocksAVX512 { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::from_vector(add(self.to_vector(), rhs.to_vector())) + } +} + +impl Sub for PackedGoldilocksAVX512 { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::from_vector(sub(self.to_vector(), rhs.to_vector())) + } +} + +impl Neg for PackedGoldilocksAVX512 { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::from_vector(neg(self.to_vector())) + } +} + +impl Mul for PackedGoldilocksAVX512 { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::from_vector(mul(self.to_vector(), rhs.to_vector())) + } +} + +impl_add_assign!(PackedGoldilocksAVX512); +impl_sub_assign!(PackedGoldilocksAVX512); +impl_mul_methods!(PackedGoldilocksAVX512); +ring_sum!(PackedGoldilocksAVX512); +impl_rng!(PackedGoldilocksAVX512); + +impl PrimeCharacteristicRing for PackedGoldilocksAVX512 { + type PrimeSubfield = Goldilocks; + + const ZERO: Self = Self::broadcast(Goldilocks::ZERO); + const ONE: Self = Self::broadcast(Goldilocks::ONE); + const TWO: Self = Self::broadcast(Goldilocks::TWO); + const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE); + + #[inline] + fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { + f.into() + } + + #[inline] + fn halve(&self) -> Self { + Self::from_vector(halve(self.to_vector())) + } + + #[inline] + fn square(&self) -> Self { + Self::from_vector(square(self.to_vector())) + } + + #[inline] + fn zero_vec(len: usize) -> Vec { + // SAFETY: this is a repr(transparent) wrapper around an array. 
+ unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) } + } +} + +impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks); +impl_sub_base_field!(PackedGoldilocksAVX512, Goldilocks); +impl_mul_base_field!(PackedGoldilocksAVX512, Goldilocks); +impl_div_methods!(PackedGoldilocksAVX512, Goldilocks); +impl_sum_prod_base_field!(PackedGoldilocksAVX512, Goldilocks); + +impl Algebra for PackedGoldilocksAVX512 { + // Benchmarked on AVX-512: chunk=4 ≈ 198ns, chunk=2 ≈ 198ns, chunk=32 ≈ 199ns. + const BATCHED_LC_CHUNK: usize = 4; +} + +// Degree of the smallest permutation polynomial for Goldilocks. +// +// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7. +impl InjectiveMonomial<7> for PackedGoldilocksAVX512 {} + +impl PermutationMonomial<7> for PackedGoldilocksAVX512 { + /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}. + /// + /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`. + fn injective_exp_root_n(&self) -> Self { + exp_10540996611094048183(*self) + } +} + +impl_packed_value!(PackedGoldilocksAVX512, Goldilocks, WIDTH); + +unsafe impl PackedField for PackedGoldilocksAVX512 { + type Scalar = Goldilocks; +} + +impl_packed_field_pow_2!( + PackedGoldilocksAVX512; + [ + (1, interleave_u64), + (2, interleave_u128), + (4, interleave_u256), + ], + WIDTH +); + +const FIELD_ORDER: __m512i = unsafe { transmute([Goldilocks::ORDER_U64; WIDTH]) }; +const EPSILON: __m512i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) }; + +#[inline] +unsafe fn canonicalize(x: __m512i) -> __m512i { + unsafe { + let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER); + _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER) + } +} + +/// Compute the modular addition `x + y mod FIELD_ORDER`. +/// +/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider +/// set of circumstances if bounds on `x` are known. 
+/// +/// The result will be a u64 which may be greater than FIELD_ORDER. +/// +/// Safety: +/// User must ensure that x + y < 2^64 + FIELD_ORDER. +#[inline] +unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + unsafe { + let res_wrapped = _mm512_add_epi64(x, y); + let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed + _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER) + } +} + +/// Compute the modular subtraction x - y mod FIELD_ORDER. +/// +/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider +/// set of circumstances if bounds on `x` are known. +/// +/// The result will be a u64 which may be greater than FIELD_ORDER. +/// +/// Safety: +/// User must ensure that x - y > -FIELD_ORDER. +#[inline] +unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + unsafe { + let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y) + let res_wrapped = _mm512_sub_epi64(x, y); + _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER) + } +} + +/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER`. +/// +/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn add(x: __m512i, y: __m512i) -> __m512i { + unsafe { add_no_double_overflow_64_64(x, canonicalize(y)) } +} + +/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER`. +/// +/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn sub(x: __m512i, y: __m512i) -> __m512i { + unsafe { sub_no_double_overflow_64_64(x, canonicalize(y)) } +} + +/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER`. +/// +/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn neg(y: __m512i) -> __m512i { + unsafe { _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) } +} + +/// Halve a vector of Goldilocks field elements. 
+#[inline(always)] +pub(crate) fn halve(input: __m512i) -> __m512i { + /* + We want this to compile to: + vptestmq least_bit, val, ONE + vpsrlq res, val, 1 + vpaddq res{least_bit}, res, maybe_half + throughput: 2 cyc/vec + latency: 4 cyc + + Given an element val in [0, P), we want to compute val/2 mod P. + If val is even: val/2 mod P = val/2 = val >> 1. + If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2 + */ + unsafe { + // Safety: If this code got compiled then AVX512 intrinsics are available. + const ONE: __m512i = unsafe { transmute([1_i64; 8]) }; + let half = _mm512_set1_epi64(P.div_ceil(2) as i64); // Compiler realises this is constant. + + let least_bit = _mm512_test_epi64_mask(input, ONE); // Determine the parity of val. + let t = _mm512_srli_epi64::<1>(input); + // This does nothing when least_bit = 1 and sets the corresponding entry to 0 when least_bit = 0 + _mm512_mask_add_epi64(t, least_bit, t, half) + } +} + +#[allow(clippy::useless_transmute)] +const LO_32_BITS_MASK: __mmask16 = unsafe { transmute(0b0101010101010101u16) }; + +/// Full 64-bit by 64-bit multiplication. +#[inline] +fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + unsafe { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on port 0, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. 
+ let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm512_mul_epu32(x, y); + let mul_lh = _mm512_mul_epu32(x, y_hi); + let mul_hl = _mm512_mul_epu32(x_hi, y); + let mul_hh = _mm512_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll); + let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. + // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm512_and_si512(t0, EPSILON); + let t0_hi = _mm512_srli_epi64::<32>(t0); + let t1 = _mm512_add_epi64(mul_lh, t0_lo); + let t2 = _mm512_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm512_srli_epi64::<32>(t1); + let res_hi = _mm512_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1))); + let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); + + (res_hi, res_lo) + } +} + +/// Full 64-bit squaring. +#[inline] +fn square64(x: __m512i) -> (__m512i, __m512i) { + unsafe { + // Get high 32 bits of x. See comment in mul64_64_s. + let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm512_mul_epu32(x, x); + let mul_lh = _mm512_mul_epu32(x, x_hi); + let mul_hh = _mm512_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). 
+ let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll); + let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm512_srli_epi64::<31>(t0); + let res_hi = _mm512_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). + let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh); + let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) + } +} + +/// Given a 128-bit value represented as two 64-bit halves, reduce it modulo the Goldilocks field order. +/// +/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`. +#[inline] +fn reduce128(x: (__m512i, __m512i)) -> __m512i { + unsafe { + let (hi0, lo0) = x; + + // Find the high 32 bits of hi0. + let hi_hi0 = _mm512_srli_epi64::<32>(hi0); + + // Computes lo0_s - hi_hi0 mod FIELD_ORDER. + // Makes sense to do as 2^96 = -1 mod FIELD_ORDER. + // `sub_no_double_overflow_64_64` is safe to use as `hi_hi0 < 2^32`. + let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0); + + // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER + // _mm256_mul_epu32 ignores the top 32 bits so just use that. + let t1 = _mm512_mul_epu32(hi0, EPSILON); + + // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 < FIELD_ORDER so we can use `add_no_double_overflow_64_64` to get + // `lo1 + t1 mod FIELD_ORDER.` + add_no_double_overflow_64_64(lo1, t1) + } +} + +/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER`. +/// +/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. +#[inline] +fn mul(x: __m512i, y: __m512i) -> __m512i { + reduce128(mul64_64(x, y)) +} + +/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER`. +/// +/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. 
+#[inline] +fn square(x: __m512i) -> __m512i { + reduce128(square64(x)) +} + +#[cfg(test)] +mod tests { + use p3_field_testing::test_packed_field; + + use super::{Goldilocks, PackedGoldilocksAVX512, WIDTH}; + + const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([ + 0xFFFF_FFFF_0000_0001, + 0xFFFF_FFFF_0000_0000, + 0xFFFF_FFFE_FFFF_FFFF, + 0xFFFF_FFFF_FFFF_FFFF, + 0x0000_0000_0000_0000, + 0x0000_0000_0000_0001, + 0x0000_0000_0000_0002, + 0x0FFF_FFFF_F000_0000, + ]); + + const ZEROS: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([ + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, + 0x0000_0000_0000_0000, + 0xFFFF_FFFF_0000_0001, + ])); + + const ONES: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([ + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, + 0x0000_0000_0000_0001, + 0xFFFF_FFFF_0000_0002, + ])); + + test_packed_field!( + crate::PackedGoldilocksAVX512, + &[super::ZEROS], + &[super::ONES], + crate::PackedGoldilocksAVX512(super::SPECIAL_VALS) + ); +} diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh new file mode 100755 index 000000000..01e3a5306 --- /dev/null +++ b/bench_vs_plonky3/run.sh @@ -0,0 +1,410 @@ +#!/bin/bash +# Benchmark: Lambda STARK vs Plonky3 — single-shot prove time on the shared +# Fibonacci AIR (columns = 2 * num_sequences, blowup = 2, fri_queries = 219). +# +# Usage: +# ./bench_vs_plonky3/run.sh [--log-rows K ...] [--num-sequences N] [--runs N] +# [--lambda-only | --p3-only] [--report-dir DIR] +# [--no-p3-patch] [--scalar] [--no-color] +# +# Defaults: --log-rows 19, --num-sequences 16, --runs 3. +# With multiple --log-rows values, prints one median row per size. +# +# --scalar: disables SIMD at the target-feature level. 
On x86_64 drops AVX2 +# and AVX-512 (Goldilocks + most of Keccak go scalar, residual SSE2 in +# p3-keccak). On aarch64 drops the SHA3 NEON extension. Triggers a rebuild +# when toggling; subsequent runs with the same RUSTFLAGS are cached. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="/tmp/bench_p3" +REPORT_DIR="" +NO_COLOR=false +NO_P3_PATCH=false +SCALAR=false + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BOLD='\033[1m' +NC='\033[0m' + +LOG_ROWS=() +NUM_SEQUENCES=16 +RUNS=3 +RUN_LAMBDA=true +RUN_P3=true + +# --- Parse args ------------------------------------------------------------- +while [[ $# -gt 0 ]]; do + case $1 in + --log-rows) + shift + while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do + LOG_ROWS+=("$1") + shift + done + ;; + --num-sequences) + if [[ $# -lt 2 ]]; then echo "--num-sequences requires an argument"; exit 1; fi + NUM_SEQUENCES=$2 + shift 2 + ;; + --runs) + if [[ $# -lt 2 ]]; then echo "--runs requires an argument"; exit 1; fi + RUNS=$2 + shift 2 + ;; + --lambda-only) + RUN_P3=false + shift + ;; + --p3-only) + RUN_LAMBDA=false + shift + ;; + --report-dir) + if [[ $# -lt 2 ]]; then echo "--report-dir requires an argument"; exit 1; fi + REPORT_DIR=$2 + shift 2 + ;; + --no-p3-patch) + NO_P3_PATCH=true + shift + ;; + --scalar) + SCALAR=true + shift + ;; + --no-color) + NO_COLOR=true + shift + ;; + -h|--help) + sed -n '2,11p' "$0" | sed 's/^# //' + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [ ${#LOG_ROWS[@]} -eq 0 ]; then + LOG_ROWS=(19) +fi + +if ! $RUN_LAMBDA && ! 
$RUN_P3; then + echo "At least one prover must be enabled" + exit 1 +fi + +if [ "$RUNS" -lt 1 ]; then + echo "--runs must be >= 1" + exit 1 +fi + +if $NO_COLOR; then + RED='' + GREEN='' + YELLOW='' + BOLD='' + NC='' +fi + +mkdir -p "$TMP_DIR" +rm -rf "$TMP_DIR"/* + +if [ -n "$REPORT_DIR" ]; then + mkdir -p "$REPORT_DIR/raw" +fi + +# --- Patch toggle ----------------------------------------------------------- +# The root Cargo.toml has a [patch.crates-io] block pointing at the vendored +# p3-goldilocks-patched (adds BinomiallyExtendable<3>, disables NEON). For the +# nightly we build against vanilla crates.io p3-goldilocks — we comment the +# block out and drop the `p3-degree3` feature. +CARGO_TOML="$ROOT_DIR/Cargo.toml" +CARGO_TOML_BAK="" +BUILD_FEATURE_FLAGS=() +if $NO_P3_PATCH; then + CARGO_TOML_BAK="$CARGO_TOML.bak.p3bench.$$" + cp "$CARGO_TOML" "$CARGO_TOML_BAK" + # Comment the [patch.crates-io] block and its entries (until the next blank + # line or next [section]). + python3 - "$CARGO_TOML" <<'PY' +import sys, pathlib +path = pathlib.Path(sys.argv[1]) +lines = path.read_text().splitlines(keepends=True) +out = [] +in_patch = False +for ln in lines: + stripped = ln.strip() + if stripped == "[patch.crates-io]": + in_patch = True + out.append("# " + ln if not ln.startswith("#") else ln) + continue + if in_patch: + if stripped.startswith("[") and stripped.endswith("]"): + in_patch = False + out.append(ln) + continue + if stripped == "": + in_patch = False + out.append(ln) + continue + out.append("# " + ln if not ln.startswith("#") else ln) + else: + out.append(ln) +path.write_text("".join(out)) +PY + trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi' EXIT INT TERM + BUILD_FEATURE_FLAGS=(--no-default-features --features parallel) +fi + +# --- Scalar (no SIMD) toggle ------------------------------------------------ +# When --scalar is on, disable vector instruction sets for the build so both +# provers run 
against the same scalar baseline. p3-keccak keeps SSE2 residual +# on x86 — acceptable per the bench workstream (contribution is ~7%). +# x86_64 → -avx2,-avx512f (Goldilocks + most of Keccak go scalar) +# aarch64 → -sha3 (drops Keccak NEON SHA3 extension) +# Cargo caches per-RUSTFLAGS, so toggling scalar vs vector triggers a rebuild +# on first use but is cached afterwards. +SCALAR_RUSTFLAGS="" +if $SCALAR; then + case "$(uname -m)" in + x86_64|amd64) + SCALAR_RUSTFLAGS="-C target-feature=-avx2,-avx512f" + ;; + arm64|aarch64) + SCALAR_RUSTFLAGS="-C target-feature=-sha3" + ;; + *) + echo "warning: --scalar: unknown arch $(uname -m); not pinning RUSTFLAGS" >&2 + ;; + esac + if [ -n "$SCALAR_RUSTFLAGS" ]; then + if [ -n "${RUSTFLAGS:-}" ]; then + export RUSTFLAGS="${RUSTFLAGS} ${SCALAR_RUSTFLAGS}" + else + export RUSTFLAGS="$SCALAR_RUSTFLAGS" + fi + fi +fi + +# --- Build ------------------------------------------------------------------ +echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}" +echo -e " log-rows: ${YELLOW}${LOG_ROWS[*]}${NC}" +echo -e " num-sequences: ${YELLOW}${NUM_SEQUENCES}${NC} (columns = $((2 * NUM_SEQUENCES)))" +echo -e " runs/size: ${YELLOW}${RUNS}${NC} (median reported)" +if $NO_P3_PATCH; then + echo -e " p3 extension: ${YELLOW}degree 2 (vanilla, no patch)${NC}" +else + echo -e " p3 extension: ${YELLOW}degree 3 (patched, matches Lambda)${NC}" +fi +if $SCALAR; then + echo -e " scalar mode: ${YELLOW}on${NC} (arch=$(uname -m), RUSTFLAGS=\"${RUSTFLAGS:-}\")" +else + echo -e " scalar mode: ${YELLOW}off${NC} (SIMD enabled, compiler default)" +fi +echo "" + +echo -e "${GREEN}[build]${NC} prove_bench" +# Use the `${arr[@]+...}` expansion so `set -u` doesn't blow up when the +# feature-flag array is empty (bash 3 on macOS). 
+cargo build --release -p bench-vs-plonky3 --bin prove_bench \ + --manifest-path "$ROOT_DIR/Cargo.toml" \ + ${BUILD_FEATURE_FLAGS[@]+"${BUILD_FEATURE_FLAGS[@]}"} 2>&1 | tail -5 + +BIN="$ROOT_DIR/target/release/prove_bench" +if [ ! -x "$BIN" ]; then + echo -e "${RED}[build] prove_bench not produced at $BIN${NC}" + exit 1 +fi + +# --- Helpers ---------------------------------------------------------------- +extract_proving_time() { + sed -nE '/Proving time: [0-9.]+s/ { + s/.*Proving time: ([0-9.]+)s.*/\1/ + p + q + }' +} + +median_of() { + # prints median of the given numeric arguments (rounded to 3 decimals). + # Uses shell `sort -g` for portability (macOS awk lacks gawk's asort). + printf '%s\n' "$@" | LC_ALL=C sort -g | LC_NUMERIC=C awk ' + { a[NR] = $0 + 0 } + END { + if (NR == 0) { print "n/a"; exit } + if (NR % 2 == 1) { + printf "%.3f\n", a[(NR + 1) / 2] + } else { + printf "%.3f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2 + } + }' +} + +ratio_fmt() { + LC_NUMERIC=C awk -v num="$1" -v den="$2" 'BEGIN { + if (den + 0 == 0) { print "n/a"; exit } + printf "%.3f\n", num / den + }' +} + +# --- Run benchmark ---------------------------------------------------------- + +RESULT_LOG_ROWS=() +RESULT_ROWS=() +RESULT_LAMBDA=() +RESULT_P3=() +RESULT_RATIO=() + +run_prover() { + local prover=$1 # lambda | p3 + local log_rows=$2 + local times=() + for run_i in $(seq 1 "$RUNS"); do + local out_file="$TMP_DIR/${prover}_${log_rows}_${run_i}.stdout" + if ! 
"$BIN" --prover "$prover" \ + --log-rows "$log_rows" \ + --num-sequences "$NUM_SEQUENCES" > "$out_file" 2>&1; then + echo -e " ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}" + cat "$out_file" + exit 1 + fi + local t + t=$(extract_proving_time < "$out_file") + if [ -z "$t" ]; then + echo -e " ${RED}[${prover}] could not parse proving time (log-rows=${log_rows}, run ${run_i})${NC}" + cat "$out_file" + exit 1 + fi + times+=("$t") + if [ -n "$REPORT_DIR" ]; then + cp "$out_file" "$REPORT_DIR/raw/${prover}_log${log_rows}_run${run_i}.stdout" + fi + done + median_of "${times[@]}" + printf '%s\n' "${times[@]}" > "$TMP_DIR/${prover}_${log_rows}.times" +} + +for lr in "${LOG_ROWS[@]}"; do + rows=$((1 << lr)) + echo -e "${BOLD}--- log-rows=${lr} (rows = ${rows}) ---${NC}" + + lambda_median="n/a" + p3_median="n/a" + + if $RUN_LAMBDA; then + echo -ne " ${GREEN}[lambda]${NC} " + lambda_median=$(run_prover lambda "$lr") + echo "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")" + fi + + if $RUN_P3; then + echo -ne " ${GREEN}[p3]${NC} " + p3_median=$(run_prover p3 "$lr") + echo "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")" + fi + + local_ratio="n/a" + if $RUN_LAMBDA && $RUN_P3; then + local_ratio=$(ratio_fmt "$lambda_median" "$p3_median") + fi + + RESULT_LOG_ROWS+=("$lr") + RESULT_ROWS+=("$rows") + RESULT_LAMBDA+=("$lambda_median") + RESULT_P3+=("$p3_median") + RESULT_RATIO+=("$local_ratio") +done + +# --- Summary table ---------------------------------------------------------- + +echo "" +echo -e "${BOLD}=== Summary ===${NC}" +if $RUN_LAMBDA && $RUN_P3; then + printf " %-9s %-12s %14s %14s %10s\n" "log-rows" "rows" "Lambda (s)" "P3 (s)" "L/P3" + printf " %-9s %-12s %14s %14s %10s\n" "--------" "----" "----------" "------" "----" +else + printf " %-9s %-12s %14s\n" "log-rows" "rows" "Time (s)" + printf " %-9s %-12s %14s\n" "--------" "----" "--------" +fi + +for i in 
"${!RESULT_LOG_ROWS[@]}"; do + lr="${RESULT_LOG_ROWS[$i]}" + rows="${RESULT_ROWS[$i]}" + lt="${RESULT_LAMBDA[$i]}" + pt="${RESULT_P3[$i]}" + rt="${RESULT_RATIO[$i]}" + if $RUN_LAMBDA && $RUN_P3; then + color=$GREEN + if awk -v l="$lt" -v p="$pt" 'BEGIN{ exit !(l+0 > p+0) }'; then + color=$RED + fi + printf " %-9s %-12s %13ss %13ss ${color}%9sx${NC}\n" \ + "$lr" "$rows" "$lt" "$pt" "$rt" + elif $RUN_LAMBDA; then + printf " %-9s %-12s %13ss\n" "$lr" "$rows" "$lt" + else + printf " %-9s %-12s %13ss\n" "$lr" "$rows" "$pt" + fi +done + +echo "" +if $RUN_LAMBDA && $RUN_P3; then + echo -e "Timing window: single-shot end-to-end prove. Ratio < 1 → Lambda faster." +fi +if $NO_P3_PATCH; then + echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2." + echo -e " Lambda keeps degree-3 — extension fields differ across sides." +fi + +# --- Machine-readable report ------------------------------------------------ + +if [ -n "$REPORT_DIR" ]; then + { + printf "log_rows\trows\tlambda_median_s\tp3_median_s\tratio_lambda_over_p3\truns\n" + for i in "${!RESULT_LOG_ROWS[@]}"; do + printf "%s\t%s\t%s\t%s\t%s\t%s\n" \ + "${RESULT_LOG_ROWS[$i]}" \ + "${RESULT_ROWS[$i]}" \ + "${RESULT_LAMBDA[$i]}" \ + "${RESULT_P3[$i]}" \ + "${RESULT_RATIO[$i]}" \ + "$RUNS" + done + } > "$REPORT_DIR/results.tsv" + + { + echo "# Lambda STARK vs Plonky3 Benchmark" + echo + echo "Timing window: \`single-shot end-to-end prove\` (no verification)." + echo "num-sequences: \`$NUM_SEQUENCES\`, columns: \`$((2 * NUM_SEQUENCES))\`, blowup: 2, fri_queries: 219, grinding: 0." + echo "runs per size: \`$RUNS\` (median reported)." + echo "arch: \`$(uname -m)\`, scalar mode: \`$($SCALAR && echo on || echo off)\`." + if $SCALAR && [ -n "$SCALAR_RUSTFLAGS" ]; then + echo "RUSTFLAGS: \`$SCALAR_RUSTFLAGS\`." + fi + if $NO_P3_PATCH; then + echo + echo "> Plonky3 built without the vendored degree-3 patch: Challenge type is degree-2 (vanilla crates.io p3-goldilocks 0.5.2). 
Lambda still uses degree 3." + fi + echo + echo "| log-rows | rows | Lambda (s) | P3 (s) | Lambda / P3 |" + echo "|---------:|-----:|-----------:|-------:|------------:|" + for i in "${!RESULT_LOG_ROWS[@]}"; do + printf "| %s | %s | %s | %s | %s |\n" \ + "${RESULT_LOG_ROWS[$i]}" \ + "${RESULT_ROWS[$i]}" \ + "${RESULT_LAMBDA[$i]}" \ + "${RESULT_P3[$i]}" \ + "${RESULT_RATIO[$i]}" + done + } > "$REPORT_DIR/summary.md" +fi diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs new file mode 100644 index 000000000..cb58aea42 --- /dev/null +++ b/bench_vs_plonky3/src/bin/prove_bench.rs @@ -0,0 +1,185 @@ +//! Minimal wall-clock benchmark harness for Lambda STARK vs Plonky3. +//! +//! Builds the same Fibonacci AIR as `instruments_breakdown` (but without any +//! instrumentation) and prints a single line `Proving time: X.XXXs` to +//! stdout, suitable for parsing by `bench_vs_plonky3/run.sh`. +//! +//! Usage: +//! prove_bench --prover {lambda|p3} [--log-rows K] [--num-sequences N] +//! [--blowup B] [--queries Q] [--grinding G] +//! +//! Defaults match production (`GoldilocksCubicProofOptions::with_blowup(2)`): +//! log-rows=19, num-sequences=16, blowup=2, queries=219, grinding=0. 
+ +use std::process::ExitCode; +use std::time::Instant; + +use bench_vs_plonky3::{lambda_fibonacci_pair, plonky3_config, plonky3_fibonacci}; +use crypto::fiat_shamir::default_transcript::DefaultTranscript; +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use stark::proof::options::ProofOptions; +use stark::prover::{IsStarkProver, Prover}; + +type F = GoldilocksField; +type E = Degree3GoldilocksExtensionField; +type FE = FieldElement; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ProverKind { + Lambda, + P3, +} + +struct Args { + prover: ProverKind, + log_rows: u32, + num_sequences: usize, + blowup: u8, + queries: usize, + grinding: u8, +} + +impl Default for Args { + fn default() -> Self { + Self { + prover: ProverKind::Lambda, + log_rows: 19, + num_sequences: 16, + blowup: 2, + queries: 219, + grinding: 0, + } + } +} + +fn print_usage() { + eprintln!( + "usage: prove_bench --prover {{lambda|p3}} \ + [--log-rows K] [--num-sequences N] \ + [--blowup B] [--queries Q] [--grinding G]" + ); +} + +fn parse_args() -> Result { + let mut args = Args::default(); + let mut prover_set = false; + let mut iter = std::env::args().skip(1); + while let Some(a) = iter.next() { + match a.as_str() { + "--prover" => { + let v = iter.next().ok_or("--prover needs a value")?; + args.prover = match v.as_str() { + "lambda" => ProverKind::Lambda, + "p3" => ProverKind::P3, + other => return Err(format!("unknown prover: {other}")), + }; + prover_set = true; + } + "--log-rows" => { + let v = iter.next().ok_or("--log-rows needs a value")?; + args.log_rows = v.parse().map_err(|_| "--log-rows: invalid u32")?; + } + "--num-sequences" => { + let v = iter.next().ok_or("--num-sequences needs a value")?; + args.num_sequences = v.parse().map_err(|_| "--num-sequences: invalid usize")?; + } + "--blowup" => { + let v = iter.next().ok_or("--blowup needs a value")?; + args.blowup = 
v.parse().map_err(|_| "--blowup: invalid u8")?; + } + "--queries" => { + let v = iter.next().ok_or("--queries needs a value")?; + args.queries = v.parse().map_err(|_| "--queries: invalid usize")?; + } + "--grinding" => { + let v = iter.next().ok_or("--grinding needs a value")?; + args.grinding = v.parse().map_err(|_| "--grinding: invalid u8")?; + } + "-h" | "--help" => { + print_usage(); + std::process::exit(0); + } + other => return Err(format!("unknown arg: {other}")), + } + } + if !prover_set { + return Err("--prover is required".into()); + } + if args.log_rows < 2 || args.log_rows > 30 { + return Err("--log-rows must be in [2, 30]".into()); + } + if args.num_sequences == 0 { + return Err("--num-sequences must be > 0".into()); + } + Ok(args) +} + +fn proof_options(args: &Args) -> ProofOptions { + ProofOptions { + blowup_factor: args.blowup, + fri_number_of_queries: args.queries, + coset_offset: 3, + grinding_factor: args.grinding, + } +} + +fn run_lambda(args: &Args) -> std::time::Duration { + let rows = 1usize << args.log_rows; + let options = proof_options(args); + + let initial_values: Vec<(FE, FE)> = (0..args.num_sequences) + .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) + .collect(); + + let mut trace = lambda_fibonacci_pair::compute_trace::(&initial_values, rows); + let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); + let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( + &options, + args.num_sequences, + ); + + let start = Instant::now(); + let _proof = Prover::::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .expect("lambda prove failed"); + start.elapsed() +} + +fn run_p3(args: &Args) -> std::time::Duration { + let rows = 1usize << args.log_rows; + let config = plonky3_config::matched_params_config(); + let air = plonky3_fibonacci::P3FibonacciAir { + num_sequences: args.num_sequences, + }; + let trace = 
plonky3_fibonacci::generate_fibonacci_trace(args.num_sequences, rows); + let pis = plonky3_fibonacci::public_values(args.num_sequences); + + let start = Instant::now(); + let _proof = p3_uni_stark::prove(&config, &air, trace, &pis); + start.elapsed() +} + +fn main() -> ExitCode { + let args = match parse_args() { + Ok(a) => a, + Err(e) => { + eprintln!("error: {e}"); + print_usage(); + return ExitCode::from(2); + } + }; + + let elapsed = match args.prover { + ProverKind::Lambda => run_lambda(&args), + ProverKind::P3 => run_p3(&args), + }; + + println!("Proving time: {:.3}s", elapsed.as_secs_f64()); + ExitCode::SUCCESS +} diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs new file mode 100644 index 000000000..2f1fd4990 --- /dev/null +++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs @@ -0,0 +1,326 @@ +//! Lambda AIR matching Plonky3's `P3FibonacciAir` exactly in shape. +//! +//! Each sequence uses 2 columns (`left`, `right`) with a 2-row transition +//! window, packing 2 Fibonacci steps per row: +//! +//! `local.left = x_{2i}` +//! `local.right = x_{2i+1}` +//! `next.left = x_{2i+2} = local.left + local.right` +//! `next.right = x_{2i+3} = local.right + next.left` +//! +//! For `num_sequences` sequences: +//! - columns = `2 * num_sequences` +//! - transition constraints = `2 * num_sequences` +//! - boundary constraints = `2 * num_sequences` (pin `(a, b)` at row 0) +//! +//! This matches `P3FibonacciAir` cell-by-cell; only the prover internals +//! (multi_prove vs uni-stark, degree-3 vs degree-2 extension) differ. 
+ +use std::marker::PhantomData; + +use math::field::{ + element::FieldElement, + traits::{IsFFTField, IsField, IsSubFieldOf}, +}; +use stark::{ + constraints::{ + boundary::{BoundaryConstraint, BoundaryConstraints}, + transition::TransitionConstraint, + }, + context::AirContext, + proof::options::ProofOptions, + trace::TraceTable, + traits::{AIR, TransitionEvaluationContext}, +}; + +/// `next.left = local.left + local.right` (advances 2 Fibonacci steps) +#[derive(Clone)] +pub struct FibPairShiftConstraint +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + seq_idx: usize, + constraint_idx: usize, + phantom_f: PhantomData, + phantom_e: PhantomData, +} + +impl FibPairShiftConstraint +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + pub fn new(seq_idx: usize, constraint_idx: usize) -> Self { + Self { + seq_idx, + constraint_idx, + phantom_f: PhantomData, + phantom_e: PhantomData, + } + } +} + +impl TransitionConstraint for FibPairShiftConstraint +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + fn degree(&self) -> usize { + 1 + } + + fn constraint_idx(&self) -> usize { + self.constraint_idx + } + + fn end_exemptions(&self) -> usize { + 1 + } + + fn evaluate( + &self, + eval_ctx: &TransitionEvaluationContext, + out: &mut [FieldElement], + ) { + match eval_ctx { + TransitionEvaluationContext::Prover { frame, .. } => { + let s0 = frame.get_evaluation_step(0); + let s1 = frame.get_evaluation_step(1); + let local_left = s0.get_main_evaluation_element(0, 2 * self.seq_idx); + let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx); + let res = next_left - local_left - local_right; + out[self.constraint_idx] = res.to_extension(); + } + TransitionEvaluationContext::Verifier { frame, .. 
} => { + let s0 = frame.get_evaluation_step(0); + let s1 = frame.get_evaluation_step(1); + let local_left = s0.get_main_evaluation_element(0, 2 * self.seq_idx); + let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx); + let res = next_left - local_left - local_right; + out[self.constraint_idx] = res; + } + } + } +} + +/// `next.right = local.right + next.left` +#[derive(Clone)] +pub struct FibPairSumConstraint +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + seq_idx: usize, + constraint_idx: usize, + phantom_f: PhantomData, + phantom_e: PhantomData, +} + +impl FibPairSumConstraint +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + pub fn new(seq_idx: usize, constraint_idx: usize) -> Self { + Self { + seq_idx, + constraint_idx, + phantom_f: PhantomData, + phantom_e: PhantomData, + } + } +} + +impl TransitionConstraint for FibPairSumConstraint +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + fn degree(&self) -> usize { + 1 + } + + fn constraint_idx(&self) -> usize { + self.constraint_idx + } + + fn end_exemptions(&self) -> usize { + 1 + } + + fn evaluate( + &self, + eval_ctx: &TransitionEvaluationContext, + out: &mut [FieldElement], + ) { + match eval_ctx { + TransitionEvaluationContext::Prover { frame, .. } => { + let s0 = frame.get_evaluation_step(0); + let s1 = frame.get_evaluation_step(1); + let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx); + let next_right = s1.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let res = next_right - local_right - next_left; + out[self.constraint_idx] = res.to_extension(); + } + TransitionEvaluationContext::Verifier { frame, .. 
} => { + let s0 = frame.get_evaluation_step(0); + let s1 = frame.get_evaluation_step(1); + let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx); + let next_right = s1.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let res = next_right - local_right - next_left; + out[self.constraint_idx] = res; + } + } + } +} + +/// Public inputs: initial `(a, b) = (left, right)` pair for each sequence. +#[derive(Clone, Debug)] +pub struct FibonacciPairPublicInputs { + pub initial_values: Vec<(FieldElement, FieldElement)>, +} + +/// Multi-sequence Fibonacci AIR with 2-row window, matching Plonky3's `P3FibonacciAir`. +pub struct FibonacciPairMultiColAIR +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + context: AirContext, + constraints: Vec>>, + num_sequences: usize, +} + +impl AIR for FibonacciPairMultiColAIR +where + F: IsSubFieldOf + IsFFTField + Send + Sync + 'static, + E: IsField + Send + Sync + 'static, +{ + type Field = F; + type FieldExtension = E; + type PublicInputs = FibonacciPairPublicInputs; + + fn step_size(&self) -> usize { + 1 + } + + fn new(proof_options: &ProofOptions) -> Self { + Self::with_num_sequences(proof_options, 2) + } + + fn composition_poly_degree_bound(&self, trace_length: usize) -> usize { + trace_length + } + + fn transition_constraints(&self) -> &Vec>> { + &self.constraints + } + + fn boundary_constraints( + &self, + pub_inputs: &Self::PublicInputs, + _rap_challenges: &[FieldElement], + _bus_public_inputs: Option<&stark::lookup::BusPublicInputs>, + _trace_length: usize, + ) -> BoundaryConstraints { + let mut constraints = Vec::with_capacity(2 * pub_inputs.initial_values.len()); + for (seq_idx, (a, b)) in pub_inputs.initial_values.iter().enumerate() { + constraints.push(BoundaryConstraint::new_main( + 2 * seq_idx, + 0, + a.clone().to_extension(), + )); + constraints.push(BoundaryConstraint::new_main( + 2 * seq_idx + 
1, + 0, + b.clone().to_extension(), + )); + } + BoundaryConstraints::from_constraints(constraints) + } + + fn context(&self) -> &AirContext { + &self.context + } + + fn trace_layout(&self) -> (usize, usize) { + (2 * self.num_sequences, 0) + } +} + +impl FibonacciPairMultiColAIR +where + F: IsSubFieldOf + IsFFTField + Send + Sync + 'static, + E: IsField + Send + Sync + 'static, +{ + pub fn with_num_sequences(proof_options: &ProofOptions, num_sequences: usize) -> Self { + let mut constraints: Vec>> = + Vec::with_capacity(2 * num_sequences); + for seq in 0..num_sequences { + constraints.push(Box::new(FibPairShiftConstraint::new(seq, 2 * seq))); + constraints.push(Box::new(FibPairSumConstraint::new(seq, 2 * seq + 1))); + } + + let context = AirContext { + proof_options: proof_options.clone(), + trace_columns: 2 * num_sequences, + transition_offsets: vec![0, 1], + num_transition_constraints: 2 * num_sequences, + }; + + Self { + context, + constraints, + num_sequences, + } + } +} + +/// Computes the packed Fibonacci trace. +/// +/// Each row holds `(x_{2i}, x_{2i+1})` for each sequence. Identical values to +/// `plonky3_fibonacci::generate_fibonacci_trace` at the same coordinates. 
+pub fn compute_trace( + initial_values: &[(FieldElement, FieldElement)], + trace_length: usize, +) -> TraceTable +where + F: IsSubFieldOf + IsFFTField + Send + Sync, + E: IsField + Send + Sync, +{ + let num_sequences = initial_values.len(); + let mut columns: Vec>> = Vec::with_capacity(2 * num_sequences); + + for (a, b) in initial_values { + let mut left_col = Vec::with_capacity(trace_length); + let mut right_col = Vec::with_capacity(trace_length); + + let mut left = a.clone(); + let mut right = b.clone(); + + for _ in 0..trace_length { + left_col.push(left.clone()); + right_col.push(right.clone()); + let new_left = left.clone() + right.clone(); + let new_right = right.clone() + new_left.clone(); + left = new_left; + right = new_right; + } + + columns.push(left_col); + columns.push(right_col); + } + + TraceTable::from_columns_main(columns, 1) +} + +pub fn create_public_inputs( + initial_values: Vec<(FieldElement, FieldElement)>, +) -> FibonacciPairPublicInputs { + FibonacciPairPublicInputs { initial_values } +} diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs new file mode 100644 index 000000000..224ad5fa9 --- /dev/null +++ b/bench_vs_plonky3/src/lib.rs @@ -0,0 +1,341 @@ +pub mod lambda_fibonacci_pair; +pub mod plonky3_config; +pub mod plonky3_fibonacci; + +#[cfg(test)] +mod tests { + use super::*; + + use crypto::fiat_shamir::default_transcript::DefaultTranscript; + use math::field::element::FieldElement; + use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; + use math::field::goldilocks::GoldilocksField; + use p3_field::PrimeField64; + use p3_uni_stark::{prove, verify}; + use stark::proof::options::ProofOptions; + use stark::prover::{IsStarkProver, Prover}; + use stark::verifier::{IsStarkVerifier, Verifier}; + + type F = GoldilocksField; + type E = Degree3GoldilocksExtensionField; + type FE = FieldElement; + + fn benchmark_proof_options() -> ProofOptions { + ProofOptions { + blowup_factor: 2, + fri_number_of_queries: 
219, + coset_offset: 3, + grinding_factor: 0, + } + } + + #[test] + fn lambda_fibonacci_pair_prove_verify() { + let num_sequences = 2; + let trace_length = 128; // 2^7 + let proof_options = benchmark_proof_options(); + + let initial_values: Vec<(FE, FE)> = (0..num_sequences) + .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) + .collect(); + + let mut trace = + lambda_fibonacci_pair::compute_trace::(&initial_values, trace_length); + let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); + let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( + &proof_options, + num_sequences, + ); + + let proof = Prover::::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .unwrap(); + + assert!(Verifier::::verify( + &proof, + &air, + &mut DefaultTranscript::::new(&[]), + )); + } + + #[test] + fn plonky3_fibonacci_prove_verify() { + let num_sequences = 2; + let rows = 128; // 2^7 + + let config = plonky3_config::matched_params_config(); + let air = plonky3_fibonacci::P3FibonacciAir { num_sequences }; + let trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows); + let pis = plonky3_fibonacci::public_values(num_sequences); + + let proof = prove(&config, &air, trace, &pis); + verify(&config, &air, &proof, &pis).expect("Plonky3 verification failed"); + } + + /// Lambda prove with instruments breakdown + P3 span-based breakdown. 
+ /// Run: cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture + #[test] + fn instruments_breakdown() { + let num_sequences = 16; + let rows = 1 << 18; + let proof_options = benchmark_proof_options(); + + let initial_values: Vec<(FE, FE)> = (0..num_sequences) + .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) + .collect(); + + let mut trace = + lambda_fibonacci_pair::compute_trace::(&initial_values, rows); + let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); + let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( + &proof_options, + num_sequences, + ); + + let start = std::time::Instant::now(); + let _proof = Prover::::prove( + &air, + &mut trace, + &pub_inputs, + &mut DefaultTranscript::::new(&[]), + ) + .unwrap(); + let total = start.elapsed(); + + println!("\n============================================================"); + println!( + "Lambda STARK Instruments (blowup={}, queries={})", + proof_options.blowup_factor, proof_options.fri_number_of_queries + ); + println!("Trace: {} rows x {} cols", rows, 2 * num_sequences); + println!("Total prove: {:.3}s", total.as_secs_f64()); + + #[cfg(feature = "instruments")] + if let Some(timing) = stark::instruments::take() { + println!("\n--- High-level phases ---"); + println!( + " Pre-pass: {:>8.1}ms", + timing.prepass.as_secs_f64() * 1000.0 + ); + println!( + " R1 Main commits: {:>8.1}ms", + timing.main_commits.as_secs_f64() * 1000.0 + ); + println!( + " R1 Aux build: {:>8.1}ms", + timing.aux_build.as_secs_f64() * 1000.0 + ); + println!( + " R1 Aux commit: {:>8.1}ms", + timing.aux_commit.as_secs_f64() * 1000.0 + ); + println!( + " Rounds 2-4: {:>8.1}ms", + timing.rounds_2_4.as_secs_f64() * 1000.0 + ); + + let r1 = &timing.round1_sub; + println!("\n--- Round 1 sub-ops ---"); + println!( + " Main LDE (FFT): {:>8.1}ms", + r1.main_lde.as_secs_f64() * 1000.0 + ); + println!( + " Main Merkle: {:>8.1}ms", + 
r1.main_merkle.as_secs_f64() * 1000.0 + ); + + for (name, tbl_rows, dur, sub) in &timing.table_timings { + println!( + "\n--- Rounds 2-4: {} ({} rows, {:.1}ms) ---", + name, + tbl_rows, + dur.as_secs_f64() * 1000.0 + ); + println!( + " R2 constraint eval:{:>8.1}ms ({:.0}%)", + sub.constraints.as_secs_f64() * 1000.0, + sub.constraints.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R2 decompose+ext: {:>8.1}ms ({:.0}%)", + sub.comp_decompose.as_secs_f64() * 1000.0, + sub.comp_decompose.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R2 comp Merkle: {:>8.1}ms ({:.0}%)", + sub.comp_commit.as_secs_f64() * 1000.0, + sub.comp_commit.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R3 OOD eval: {:>8.1}ms ({:.0}%)", + sub.ood.as_secs_f64() * 1000.0, + sub.ood.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R4 deep comp: {:>8.1}ms ({:.0}%)", + sub.deep_comp.as_secs_f64() * 1000.0, + sub.deep_comp.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R4 deep extend: {:>8.1}ms ({:.0}%)", + sub.deep_extend.as_secs_f64() * 1000.0, + sub.deep_extend.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R4 FRI commit: {:>8.1}ms ({:.0}%)", + sub.fri_commit.as_secs_f64() * 1000.0, + sub.fri_commit.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + println!( + " R4 queries+open: {:>8.1}ms ({:.0}%)", + sub.queries.as_secs_f64() * 1000.0, + sub.queries.as_secs_f64() / total.as_secs_f64() * 100.0 + ); + } + } + + #[cfg(not(feature = "instruments"))] + println!("(rebuild with --features instruments for breakdown)"); + + // --- Plonky3 breakdown via tracing spans --- + // Captures ALL spans (info + debug) so we see quotient_values, FRI commit, etc. 
+ println!("\n============================================================"); + println!("Plonky3 STARK Span Breakdown"); + + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; + use tracing_subscriber::layer::SubscriberExt; + + type SpanResults = Arc>>; + + struct P3TimingLayer { + spans: Mutex)>>, + results: SpanResults, + } + + impl tracing_subscriber::registry::LookupSpan<'lookup>> + tracing_subscriber::Layer for P3TimingLayer + { + fn on_new_span( + &self, + attrs: &tracing::span::Attributes<'_>, + id: &tracing::span::Id, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + let name = attrs.metadata().name().to_string(); + self.spans + .lock() + .unwrap() + .insert(id.into_u64(), (name, None)); + } + + fn on_enter( + &self, + id: &tracing::span::Id, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) { + entry.1 = Some(std::time::Instant::now()); + } + } + + fn on_close( + &self, + id: tracing::span::Id, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + if let Some((name, Some(start))) = + self.spans.lock().unwrap().remove(&id.into_u64()) + { + let ms = start.elapsed().as_secs_f64() * 1000.0; + self.results.lock().unwrap().push((name, ms)); + } + } + } + + let results: SpanResults = Arc::new(Mutex::new(Vec::new())); + let layer = P3TimingLayer { + spans: Mutex::new(HashMap::new()), + results: Arc::clone(&results), + }; + let filter = tracing_subscriber::filter::LevelFilter::DEBUG; + let subscriber = tracing_subscriber::registry().with(filter).with(layer); + + let config = plonky3_config::matched_params_config(); + let p3_air = plonky3_fibonacci::P3FibonacciAir { num_sequences }; + let p3_trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows); + let p3_pis = plonky3_fibonacci::public_values(num_sequences); + + let p3_prove_dur; + { + let _guard = tracing::subscriber::set_default(subscriber); + let p3_start = std::time::Instant::now(); + let 
_p3_proof = p3_uni_stark::prove(&config, &p3_air, p3_trace, &p3_pis); + p3_prove_dur = p3_start.elapsed(); + } + + let total_ms = p3_prove_dur.as_secs_f64() * 1000.0; + println!(" Prove total: {:.1}ms\n", total_ms); + + // Sort spans by duration descending and print + let mut span_data = results.lock().unwrap().clone(); + span_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + for (name, ms) in &span_data { + if *ms >= 0.1 { + println!(" {:.<40} {:>8.1}ms ({:.0}%)", name, ms, ms / total_ms * 100.0); + } + } + let accounted: f64 = span_data.iter().map(|(_, ms)| ms).sum(); + let unaccounted = total_ms - accounted; + if unaccounted > 1.0 { + println!( + " {:.<40} {:>8.1}ms ({:.0}%)", + "(unaccounted)", + unaccounted, + unaccounted / total_ms * 100.0 + ); + } + println!("============================================================\n"); + } + + /// Verifies that the new Lambda pair AIR trace and the Plonky3 trace are + /// cell-by-cell identical at the same (row, col) coordinates. + #[test] + fn lambda_pair_trace_matches_plonky3_trace() { + let num_sequences = 3; + let rows = 16; + + let initial_values: Vec<(FE, FE)> = (0..num_sequences) + .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) + .collect(); + + let lambda_trace = + lambda_fibonacci_pair::compute_trace::(&initial_values, rows); + let p3_trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows); + + assert_eq!(p3_trace.width, 2 * num_sequences); + for row in 0..rows { + for seq in 0..num_sequences { + let p3_left = p3_trace.values[row * p3_trace.width + 2 * seq].as_canonical_u64(); + let p3_right = + p3_trace.values[row * p3_trace.width + 2 * seq + 1].as_canonical_u64(); + + assert_eq!( + FE::from(p3_left), + lambda_trace.get_main(row, 2 * seq).clone(), + "left mismatch at row {row}, seq {seq}" + ); + assert_eq!( + FE::from(p3_right), + lambda_trace.get_main(row, 2 * seq + 1).clone(), + "right mismatch at row {row}, seq {seq}" + ); + } + } + } +} diff --git 
a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs new file mode 100644 index 000000000..b74f18ad2 --- /dev/null +++ b/bench_vs_plonky3/src/plonky3_config.rs @@ -0,0 +1,92 @@ +use p3_challenger::{HashChallenger, SerializingChallenger64}; +use p3_commit::ExtensionMmcs; +use p3_dft::Radix2DitParallel; +use p3_field::extension::BinomialExtensionField; +use p3_fri::{FriParameters, TwoAdicFriPcs}; +use p3_goldilocks::Goldilocks; +use p3_keccak::{Keccak256Hash, KeccakF}; +use p3_merkle_tree::MerkleTreeMmcs; +use p3_symmetric::{CompressionFunctionFromHasher, PaddingFreeSponge, SerializingHasher}; +use p3_uni_stark::StarkConfig; + +pub type Val = Goldilocks; + +/// Cubic extension (default, `p3-degree3` feature): matches Lambda's +/// `Degree3GoldilocksExtensionField`, irreducible x^3 - 2. Needs the vendored +/// `p3-goldilocks-patched` crate (enabled via root `[patch.crates-io]`). +#[cfg(feature = "p3-degree3")] +pub type Challenge = BinomialExtensionField; + +/// Quadratic extension (vanilla upstream p3-goldilocks 0.5.2). Compiled when +/// `p3-degree3` is disabled, typically together with commenting the root +/// `[patch.crates-io]` block. Lambda still runs degree 3, so this is NOT a +/// fair comparison on the extension field — it is used for nightly tracking +/// against the off-the-shelf P3 config. 
+#[cfg(not(feature = "p3-degree3"))] +pub type Challenge = BinomialExtensionField; + +type ByteHash = Keccak256Hash; +type U64Hash = PaddingFreeSponge; +type FieldHash = SerializingHasher; +type MyCompress = CompressionFunctionFromHasher; +pub type ValMmcs = MerkleTreeMmcs< + [Val; p3_keccak::VECTOR_LEN], + [u64; p3_keccak::VECTOR_LEN], + FieldHash, + MyCompress, + 2, + 4, +>; +type ChallengeMmcs = ExtensionMmcs; +type Dft = Radix2DitParallel; +pub type Pcs = TwoAdicFriPcs; +pub type Challenger = SerializingChallenger64>; + +pub type P3Config = StarkConfig; + +fn build_mmcs() -> (ValMmcs, ChallengeMmcs, ByteHash) { + let byte_hash = ByteHash {}; + let u64_hash = U64Hash::new(KeccakF {}); + let field_hash = FieldHash::new(u64_hash); + let compress = MyCompress::new(u64_hash); + let val_mmcs = ValMmcs::new(field_hash, compress, 3); + let challenge_mmcs = ChallengeMmcs::new(val_mmcs.clone()); + (val_mmcs, challenge_mmcs, byte_hash) +} + +/// Creates a Plonky3 STARK config with parameters matched to Lambda's +/// production config `GoldilocksCubicProofOptions::with_blowup(2)`: +/// blowup=2, 219 FRI queries, grinding=0 (excluded from benchmark). +pub fn matched_params_config() -> P3Config { + let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs(); + let dft = Dft::default(); + let challenger = Challenger::from_hasher(vec![], byte_hash); + + // Match Lambda production: blowup=2, queries=219, grinding=0. + // Grinding excluded from benchmark (identical PoW on both sides). + let fri_params = FriParameters { + log_blowup: 1, // blowup = 2 + log_final_poly_len: 0, + max_log_arity: 1, + num_queries: 219, + commit_proof_of_work_bits: 0, + query_proof_of_work_bits: 0, + mmcs: challenge_mmcs, + }; + + let pcs = Pcs::new(dft, val_mmcs, fri_params); + P3Config::new(pcs, challenger) +} + +/// Creates a Plonky3 STARK config with Plonky3's standard benchmark parameters: +/// blowup=2, 100 FRI queries, 16-bit query PoW. 
+pub fn plonky3_benchmark_config() -> P3Config { + let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs(); + let dft = Dft::default(); + let challenger = Challenger::from_hasher(vec![], byte_hash); + + let fri_params = p3_fri::create_benchmark_fri_params(challenge_mmcs); + + let pcs = Pcs::new(dft, val_mmcs, fri_params); + P3Config::new(pcs, challenger) +} diff --git a/bench_vs_plonky3/src/plonky3_fibonacci.rs b/bench_vs_plonky3/src/plonky3_fibonacci.rs new file mode 100644 index 000000000..c55bca8c5 --- /dev/null +++ b/bench_vs_plonky3/src/plonky3_fibonacci.rs @@ -0,0 +1,144 @@ +use p3_air::{Air, AirBuilder, BaseAir, WindowAccess}; +use p3_field::PrimeCharacteristicRing; +use p3_goldilocks::Goldilocks; +use p3_matrix::dense::RowMajorMatrix; + +/// Multi-sequence Fibonacci AIR for Plonky3. +/// +/// Each sequence uses 2 columns (left, right) in a 2-row window, where each +/// Plonky3 row stores two consecutive Lambda rows: +/// local.left = x_{2i} +/// local.right = x_{2i+1} +/// next.left = x_{2i+2} = local.left + local.right +/// next.right = x_{2i+3} = local.right + next.left +/// +/// This packs two consecutive Lambda trace rows into one Plonky3 row. It is the +/// closest encoding of Lambda's `row + 2` Fibonacci transition available in +/// Plonky3's current/next-row AIR window while keeping the same committed cell +/// count. +/// +/// Boundary constraints at the first row pin each sequence's initial (a, b) +/// values against public inputs, matching Lambda's `FibonacciMultiColumnAIR`. +/// +/// Public values layout: `[a_0, b_0, a_1, b_1, ..., a_{N-1}, b_{N-1}]` +/// where `N = num_sequences`. +/// +/// For `num_sequences` sequences, the AIR has `2 * num_sequences` columns +/// and `2 * num_sequences` public values. 
+pub struct P3FibonacciAir { + pub num_sequences: usize, +} + +impl BaseAir for P3FibonacciAir { + fn width(&self) -> usize { + 2 * self.num_sequences + } + + fn num_public_values(&self) -> usize { + 2 * self.num_sequences + } +} + +impl Air for P3FibonacciAir { + fn eval(&self, builder: &mut AB) { + let main = builder.main(); + let local = main.current_slice(); + let next = main.next_slice(); + + // Collect (left, right, next_left, next_right, a, b) per sequence so that + // `pis`'s borrow on `builder` can end before we mutate `builder`. + let rows: Vec<( + AB::Var, + AB::Var, + AB::Var, + AB::Var, + AB::PublicVar, + AB::PublicVar, + )> = { + let pis = builder.public_values(); + (0..self.num_sequences) + .map(|seq| { + ( + local[2 * seq], + local[2 * seq + 1], + next[2 * seq], + next[2 * seq + 1], + pis[2 * seq], + pis[2 * seq + 1], + ) + }) + .collect() + }; + drop(main); + + for (left, right, next_left, next_right, a, b) in rows { + // Boundary: first row pins (left, right) = (a, b) + let mut when_first_row = builder.when_first_row(); + when_first_row.assert_eq(left, a); + when_first_row.assert_eq(right, b); + + let mut when_transition = builder.when_transition(); + // Advance two Lambda rows per Plonky3 row. + when_transition.assert_eq(next_left, left + right); + when_transition.assert_eq(next_right, right + next_left); + } + } +} + +/// Generates a Fibonacci trace for Plonky3. +/// +/// For `num_sequences` sequences and `num_rows` rows (must be power of 2), +/// produces a `RowMajorMatrix` with `2 * num_sequences` columns. +/// Use `rows_for_lambda_trace(lambda_trace_length)` when comparing against +/// Lambda's one-column-per-sequence trace. +/// +/// Each sequence `s` starts with initial values matching Lambda's +/// `create_initial_values()`: `left = s + 1`, `right = s + 2`. 
+pub fn generate_fibonacci_trace( + num_sequences: usize, + num_rows: usize, +) -> RowMajorMatrix { + assert!(num_rows.is_power_of_two(), "num_rows must be a power of 2"); + let width = 2 * num_sequences; + let mut values = vec![Goldilocks::ZERO; width * num_rows]; + + for seq in 0..num_sequences { + let mut left = Goldilocks::from_u64((seq + 1) as u64); + let mut right = Goldilocks::from_u64((seq + 2) as u64); + + for row in 0..num_rows { + values[row * width + 2 * seq] = left; + values[row * width + 2 * seq + 1] = right; + let next_left = left + right; + let next_right = right + next_left; + left = next_left; + right = next_right; + } + } + + RowMajorMatrix::new(values, width) +} + +/// Returns the number of packed Plonky3 rows for a Lambda trace length. +pub fn rows_for_lambda_trace(lambda_trace_length: usize) -> usize { + assert!( + lambda_trace_length >= 2, + "lambda_trace_length must contain at least two rows" + ); + assert!( + lambda_trace_length.is_power_of_two(), + "lambda_trace_length must be a power of 2" + ); + lambda_trace_length / 2 +} + +/// Builds public values matching `generate_fibonacci_trace`'s initial values: +/// `[a_0, b_0, a_1, b_1, ...] 
= [1, 2, 2, 3, 3, 4, ...]` +pub fn public_values(num_sequences: usize) -> Vec { + let mut pis = Vec::with_capacity(2 * num_sequences); + for seq in 0..num_sequences { + pis.push(Goldilocks::from_u64((seq + 1) as u64)); + pis.push(Goldilocks::from_u64((seq + 2) as u64)); + } + pis +} From 1fd1a58ec2edabc76c8f08aaeeffc3f8340ee524 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 18:10:01 -0300 Subject: [PATCH 16/34] lint --- bench_vs_plonky3/benches/stark_comparison.rs | 36 +++++++------------ bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 12 ++----- bench_vs_plonky3/src/lib.rs | 24 +++++++------ bench_vs_plonky3/src/plonky3_fibonacci.rs | 21 ++++++----- 4 files changed, 40 insertions(+), 53 deletions(-) diff --git a/bench_vs_plonky3/benches/stark_comparison.rs b/bench_vs_plonky3/benches/stark_comparison.rs index fd90ae7b5..577664892 100644 --- a/bench_vs_plonky3/benches/stark_comparison.rs +++ b/bench_vs_plonky3/benches/stark_comparison.rs @@ -47,9 +47,7 @@ fn lambda_initial_values() -> Vec<(FE, FE)> { fn bench_lambda_prove(c: &mut Criterion) { let mut group = c.benchmark_group("lambda_stark/prove"); - group.throughput(Throughput::Elements( - (ROWS * 2 * NUM_SEQUENCES) as u64, - )); + group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64)); let proof_options = benchmark_proof_options(); group.bench_with_input( @@ -59,16 +57,13 @@ fn bench_lambda_prove(c: &mut Criterion) { b.iter_with_setup( || { let initial_values = lambda_initial_values(); - let trace = lambda_fibonacci_pair::compute_trace::( - &initial_values, - rows, - ); - let pub_inputs = - lambda_fibonacci_pair::create_public_inputs(initial_values); - let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( - &proof_options, - NUM_SEQUENCES, - ); + let trace = lambda_fibonacci_pair::compute_trace::(&initial_values, rows); + let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); + let air = + 
lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( + &proof_options, + NUM_SEQUENCES, + ); (trace, pub_inputs, air) }, |(mut trace, pub_inputs, air)| { @@ -88,9 +83,7 @@ fn bench_lambda_prove(c: &mut Criterion) { fn bench_plonky3_prove(c: &mut Criterion) { let mut group = c.benchmark_group("plonky3_stark/prove"); - group.throughput(Throughput::Elements( - (ROWS * 2 * NUM_SEQUENCES) as u64, - )); + group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64)); group.bench_with_input( BenchmarkId::new("fibonacci", TRACE_LABEL), @@ -102,8 +95,7 @@ fn bench_plonky3_prove(c: &mut Criterion) { let air = plonky3_fibonacci::P3FibonacciAir { num_sequences: NUM_SEQUENCES, }; - let trace = - plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, rows); + let trace = plonky3_fibonacci::generate_fibonacci_trace(NUM_SEQUENCES, rows); let pis = plonky3_fibonacci::public_values(NUM_SEQUENCES); (config, air, trace, pis) }, @@ -116,9 +108,7 @@ fn bench_plonky3_prove(c: &mut Criterion) { fn bench_lambda_verify(c: &mut Criterion) { let mut group = c.benchmark_group("lambda_stark/verify"); - group.throughput(Throughput::Elements( - (ROWS * 2 * NUM_SEQUENCES) as u64, - )); + group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64)); let proof_options = benchmark_proof_options(); let initial_values = lambda_initial_values(); @@ -150,9 +140,7 @@ fn bench_lambda_verify(c: &mut Criterion) { fn bench_plonky3_verify(c: &mut Criterion) { let mut group = c.benchmark_group("plonky3_stark/verify"); - group.throughput(Throughput::Elements( - (ROWS * 2 * NUM_SEQUENCES) as u64, - )); + group.throughput(Throughput::Elements((ROWS * 2 * NUM_SEQUENCES) as u64)); let air = plonky3_fibonacci::P3FibonacciAir { num_sequences: NUM_SEQUENCES, diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs index 2f1fd4990..54c704976 100644 --- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs +++ 
b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs @@ -78,11 +78,7 @@ where 1 } - fn evaluate( - &self, - eval_ctx: &TransitionEvaluationContext, - out: &mut [FieldElement], - ) { + fn evaluate(&self, eval_ctx: &TransitionEvaluationContext, out: &mut [FieldElement]) { match eval_ctx { TransitionEvaluationContext::Prover { frame, .. } => { let s0 = frame.get_evaluation_step(0); @@ -151,11 +147,7 @@ where 1 } - fn evaluate( - &self, - eval_ctx: &TransitionEvaluationContext, - out: &mut [FieldElement], - ) { + fn evaluate(&self, eval_ctx: &TransitionEvaluationContext, out: &mut [FieldElement]) { match eval_ctx { TransitionEvaluationContext::Prover { frame, .. } => { let s0 = frame.get_evaluation_step(0); diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs index 224ad5fa9..31e9ef470 100644 --- a/bench_vs_plonky3/src/lib.rs +++ b/bench_vs_plonky3/src/lib.rs @@ -39,8 +39,7 @@ mod tests { .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) .collect(); - let mut trace = - lambda_fibonacci_pair::compute_trace::(&initial_values, trace_length); + let mut trace = lambda_fibonacci_pair::compute_trace::(&initial_values, trace_length); let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( &proof_options, @@ -88,8 +87,7 @@ mod tests { .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) .collect(); - let mut trace = - lambda_fibonacci_pair::compute_trace::(&initial_values, rows); + let mut trace = lambda_fibonacci_pair::compute_trace::(&initial_values, rows); let pub_inputs = lambda_fibonacci_pair::create_public_inputs(initial_values); let air = lambda_fibonacci_pair::FibonacciPairMultiColAIR::::with_num_sequences( &proof_options, @@ -218,8 +216,9 @@ mod tests { results: SpanResults, } - impl tracing_subscriber::registry::LookupSpan<'lookup>> - tracing_subscriber::Layer for P3TimingLayer + impl< + S: tracing::Subscriber + for<'lookup> 
tracing_subscriber::registry::LookupSpan<'lookup>, + > tracing_subscriber::Layer for P3TimingLayer { fn on_new_span( &self, @@ -249,8 +248,7 @@ mod tests { id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>, ) { - if let Some((name, Some(start))) = - self.spans.lock().unwrap().remove(&id.into_u64()) + if let Some((name, Some(start))) = self.spans.lock().unwrap().remove(&id.into_u64()) { let ms = start.elapsed().as_secs_f64() * 1000.0; self.results.lock().unwrap().push((name, ms)); @@ -287,7 +285,12 @@ mod tests { span_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); for (name, ms) in &span_data { if *ms >= 0.1 { - println!(" {:.<40} {:>8.1}ms ({:.0}%)", name, ms, ms / total_ms * 100.0); + println!( + " {:.<40} {:>8.1}ms ({:.0}%)", + name, + ms, + ms / total_ms * 100.0 + ); } } let accounted: f64 = span_data.iter().map(|(_, ms)| ms).sum(); @@ -314,8 +317,7 @@ mod tests { .map(|i| (FE::from((i + 1) as u64), FE::from((i + 2) as u64))) .collect(); - let lambda_trace = - lambda_fibonacci_pair::compute_trace::(&initial_values, rows); + let lambda_trace = lambda_fibonacci_pair::compute_trace::(&initial_values, rows); let p3_trace = plonky3_fibonacci::generate_fibonacci_trace(num_sequences, rows); assert_eq!(p3_trace.width, 2 * num_sequences); diff --git a/bench_vs_plonky3/src/plonky3_fibonacci.rs b/bench_vs_plonky3/src/plonky3_fibonacci.rs index c55bca8c5..b1f0816eb 100644 --- a/bench_vs_plonky3/src/plonky3_fibonacci.rs +++ b/bench_vs_plonky3/src/plonky3_fibonacci.rs @@ -39,6 +39,18 @@ impl BaseAir for P3FibonacciAir { } } +/// One sequence's (local_left, local_right, next_left, next_right, a, b) +/// snapshot extracted from an `AirBuilder`. Factored out to keep the +/// `Air::eval` signature readable (clippy::type_complexity). 
+type FibPairRow = ( + ::Var, + ::Var, + ::Var, + ::Var, + ::PublicVar, + ::PublicVar, +); + impl Air for P3FibonacciAir { fn eval(&self, builder: &mut AB) { let main = builder.main(); @@ -47,14 +59,7 @@ impl Air for P3FibonacciAir { // Collect (left, right, next_left, next_right, a, b) per sequence so that // `pis`'s borrow on `builder` can end before we mutate `builder`. - let rows: Vec<( - AB::Var, - AB::Var, - AB::Var, - AB::Var, - AB::PublicVar, - AB::PublicVar, - )> = { + let rows: Vec> = { let pis = builder.public_values(); (0..self.num_sequences) .map(|seq| { From c9f9df99fbb5c0baeee00ca1d9fb5bbbb6f397a7 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 19:11:16 -0300 Subject: [PATCH 17/34] Replace summary.md with metrics.txt in bench_vs_plonky3 and add README --- .github/workflows/bench-vs-p3-nightly.yml | 3 -- bench_vs_plonky3/run.sh | 59 +++++++++++++---------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml index b8602d7d4..f7856a139 100644 --- a/.github/workflows/bench-vs-p3-nightly.yml +++ b/.github/workflows/bench-vs-p3-nightly.yml @@ -46,6 +46,3 @@ jobs: name: bench-vs-p3-nightly-${{ github.run_number }}-${{ github.sha }} path: bench_vs_p3_artifacts retention-days: 90 - - - name: Publish summary - run: cat bench_vs_p3_artifacts/summary.md >> "$GITHUB_STEP_SUMMARY" diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index 01e3a5306..445d2bf14 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -344,11 +344,13 @@ for i in "${!RESULT_LOG_ROWS[@]}"; do rt="${RESULT_RATIO[$i]}" if $RUN_LAMBDA && $RUN_P3; then color=$GREEN + verdict="Lambda faster" if awk -v l="$lt" -v p="$pt" 'BEGIN{ exit !(l+0 > p+0) }'; then color=$RED + verdict="P3 faster" fi - printf " %-9s %-12s %13ss %13ss ${color}%9sx${NC}\n" \ - "$lr" "$rows" "$lt" "$pt" "$rt" + printf " %-9s %-12s %13ss %13ss ${color}%9sx${NC} (${color}%s${NC})\n" \ 
+ "$lr" "$rows" "$lt" "$pt" "$rt" "$verdict" elif $RUN_LAMBDA; then printf " %-9s %-12s %13ss\n" "$lr" "$rows" "$lt" else @@ -358,7 +360,8 @@ done echo "" if $RUN_LAMBDA && $RUN_P3; then - echo -e "Timing window: single-shot end-to-end prove. Ratio < 1 → Lambda faster." + echo -e "Timing window: single-shot end-to-end prove." + echo -e "Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster." fi if $NO_P3_PATCH; then echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2." @@ -368,6 +371,16 @@ fi # --- Machine-readable report ------------------------------------------------ if [ -n "$REPORT_DIR" ]; then + # Slash-joined helpers for metrics.txt (mirrors the format used by + # bench_vs/run.sh). + join_slash() { + local joined="" + for value in "$@"; do + joined="${joined:+$joined/}$value" + done + printf "%s\n" "$joined" + } + { printf "log_rows\trows\tlambda_median_s\tp3_median_s\tratio_lambda_over_p3\truns\n" for i in "${!RESULT_LOG_ROWS[@]}"; do @@ -382,29 +395,23 @@ if [ -n "$REPORT_DIR" ]; then } > "$REPORT_DIR/results.tsv" { - echo "# Lambda STARK vs Plonky3 Benchmark" - echo - echo "Timing window: \`single-shot end-to-end prove\` (no verification)." - echo "num-sequences: \`$NUM_SEQUENCES\`, columns: \`$((2 * NUM_SEQUENCES))\`, blowup: 2, fri_queries: 219, grinding: 0." - echo "runs per size: \`$RUNS\` (median reported)." - echo "arch: \`$(uname -m)\`, scalar mode: \`$($SCALAR && echo on || echo off)\`." + echo "arch=$(uname -m)" + echo "num_sequences=$NUM_SEQUENCES" + echo "columns=$((2 * NUM_SEQUENCES))" + echo "blowup=2" + echo "fri_queries=219" + echo "grinding=0" + echo "runs_per_size=$RUNS" + echo "p3_extension=$($NO_P3_PATCH && echo 'degree2_vanilla' || echo 'degree3_patched')" + echo "scalar=$($SCALAR && echo on || echo off)" if $SCALAR && [ -n "$SCALAR_RUSTFLAGS" ]; then - echo "RUSTFLAGS: \`$SCALAR_RUSTFLAGS\`." 
- fi - if $NO_P3_PATCH; then - echo - echo "> Plonky3 built without the vendored degree-3 patch: Challenge type is degree-2 (vanilla crates.io p3-goldilocks 0.5.2). Lambda still uses degree 3." + echo "rustflags=$SCALAR_RUSTFLAGS" fi - echo - echo "| log-rows | rows | Lambda (s) | P3 (s) | Lambda / P3 |" - echo "|---------:|-----:|-----------:|-------:|------------:|" - for i in "${!RESULT_LOG_ROWS[@]}"; do - printf "| %s | %s | %s | %s | %s |\n" \ - "${RESULT_LOG_ROWS[$i]}" \ - "${RESULT_ROWS[$i]}" \ - "${RESULT_LAMBDA[$i]}" \ - "${RESULT_P3[$i]}" \ - "${RESULT_RATIO[$i]}" - done - } > "$REPORT_DIR/summary.md" + echo "timing_window=single_shot_end_to_end_prove_no_verify" + echo "log_rows_series=$(join_slash "${RESULT_LOG_ROWS[@]}")" + echo "rows_series=$(join_slash "${RESULT_ROWS[@]}")" + echo "lambda_medians=$(join_slash "${RESULT_LAMBDA[@]}")" + echo "p3_medians=$(join_slash "${RESULT_P3[@]}")" + echo "ratios_lambda_over_p3=$(join_slash "${RESULT_RATIO[@]}")" + } > "$REPORT_DIR/metrics.txt" fi From ba4c7cd71b20341e53674bc6e977c66d625bde30 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 19:33:44 -0300 Subject: [PATCH 18/34] remove line --- bench_vs_plonky3/run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index 445d2bf14..550a83a77 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -303,13 +303,13 @@ for lr in "${LOG_ROWS[@]}"; do if $RUN_LAMBDA; then echo -ne " ${GREEN}[lambda]${NC} " lambda_median=$(run_prover lambda "$lr") - echo "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")" + echo -e "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")" fi if $RUN_P3; then echo -ne " ${GREEN}[p3]${NC} " p3_median=$(run_prover p3 "$lr") - echo "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")" + echo -e "median 
${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")" fi local_ratio="n/a" @@ -361,7 +361,6 @@ done echo "" if $RUN_LAMBDA && $RUN_P3; then echo -e "Timing window: single-shot end-to-end prove." - echo -e "Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster." fi if $NO_P3_PATCH; then echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2." From f7fe4d3e0750e773049d7ae81ac97ee432a9b3c7 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 19:37:52 -0300 Subject: [PATCH 19/34] add README --- bench_vs_plonky3/README.md | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 bench_vs_plonky3/README.md diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md new file mode 100644 index 000000000..04260876f --- /dev/null +++ b/bench_vs_plonky3/README.md @@ -0,0 +1,154 @@ +# Lambda STARK vs Plonky3 Benchmark + +Compares **single-shot end-to-end proving time** for an identical multi-sequence +Fibonacci AIR. Complements `bench_vs/` (which compares Lambda VM vs SP1 on a +full guest program) by isolating the STARK prover — no VM execution, no trace +builder, just one AIR and two provers. + +## What is measured + +Both provers prove the same AIR: + +- **Columns** = `2 × num_sequences` (default 16 sequences → 32 columns). +- **Rows** = `2 ^ log_rows` (default `19` → 524 288 rows). +- **Blowup** = 2 (matches Lambda production `GoldilocksCubicProofOptions::with_blowup(2)`). +- **FRI queries** = 219, grinding = 0. 
+ +The timing window on both sides is **`Instant::now()` around `prove`, no +verification, no proof serialization**: + +| Phase | Lambda STARK | Plonky3 | +|--------------------------------------|:------------:|:-------:| +| Build AIR + trace | ❌ (outside) | ❌ (outside) | +| Build public inputs | ❌ (outside) | ❌ (outside) | +| Prove (Round 1 → Round 4) | ✅ | ✅ (`p3_uni_stark::prove`) | +| Proof serialize / disk write | ❌ | ❌ | +| Verify | ❌ | ❌ | + +Lambda's trace, public inputs, and AIR are constructed via +`lambda_fibonacci_pair::{compute_trace, create_public_inputs, FibonacciPairMultiColAIR}`. +Plonky3's counterpart uses `plonky3_fibonacci::{P3FibonacciAir, generate_fibonacci_trace, public_values}` +with `plonky3_config::matched_params_config`. Both AIRs are **cell-by-cell +equivalent** — this is asserted by the `lambda_pair_trace_matches_plonky3_trace` +test. + +## Prerequisites + +- Rust stable (the crate builds with `cargo build --release`). +- No SP1 toolchain needed — there's no VM guest compilation. +- For `--no-p3-patch` mode: a network-reachable crates.io (the script pulls + vanilla `p3-goldilocks 0.5.2` on demand). +- For default mode (with the degree-3 patch): the vendored crate at + `bench_vs_plonky3/p3-goldilocks-patched/` and the root `[patch.crates-io]` + entry pointing at it. + +## Usage + +```bash +# Default: log-rows=19, num-sequences=16, runs=3, with degree-3 patch, no scalar +./bench_vs_plonky3/run.sh + +# Size sweep +./bench_vs_plonky3/run.sh --log-rows 17 18 19 20 + +# Single prover +./bench_vs_plonky3/run.sh --lambda-only +./bench_vs_plonky3/run.sh --p3-only + +# Nightly-equivalent (vanilla P3 degree-2, scalar on both sides) +./bench_vs_plonky3/run.sh --no-p3-patch --scalar + +# Write machine-readable artifacts +./bench_vs_plonky3/run.sh --report-dir /tmp/p3_report --no-color +``` + +### Flags + +| Flag | Default | Effect | +|---|---|---| +| `--log-rows K [K ...]` | `19` | One or more base-2 logarithms of the row count (each run uses `2^K` rows). 
| +| `--num-sequences N` | `16` | Number of Fibonacci sequences (columns = `2 × N`). | +| `--runs N` | `3` | Runs per `(size, prover)`; median is reported. | +| `--lambda-only` / `--p3-only` | both | Restrict to a single prover. | +| `--report-dir DIR` | — | Write TSV + metrics + raw stdouts. | +| `--no-p3-patch` | off | Comment the root `[patch.crates-io]` before building and restore on exit. Plonky3 compiles against vanilla crates.io `p3-goldilocks 0.5.2` (`BinomialExtensionField`). Lambda still runs degree 3 — the extension fields differ across sides but the AIRs stay identical. | +| `--scalar` | off | Pin `RUSTFLAGS` to disable SIMD on both sides. On `x86_64` drops AVX2 and AVX-512 (Goldilocks + most of Keccak go scalar, SSE2 residual on `p3-keccak`). On `aarch64` drops the `sha3` ISA extension (Keccak accelerator). | +| `--no-color` | off | Disable ANSI colors. | +| `-h` / `--help` | — | Print usage. | + +## Output + +Stdout (without `--report-dir`): + +``` +=== STARK prove benchmark: Lambda vs Plonky3 === + log-rows: 19 + num-sequences: 16 (columns = 32) + runs/size: 3 (median reported) + p3 extension: degree 2 (vanilla, no patch) + scalar mode: on (arch=x86_64, RUSTFLAGS="-C target-feature=-avx2,-avx512f") + +[build] prove_bench +--- log-rows=19 (rows = 524288) --- + [lambda] median 2.444s from 3 runs: 2.444,2.279,2.830 + [p3] median 0.988s from 3 runs: 0.981,0.993,0.988 + +=== Summary === + log-rows rows Lambda (s) P3 (s) L/P3 + -------- ---- ---------- ------ ---- + 19 524288 2.444s 0.988s 2.474x (P3 faster) + +Timing window: single-shot end-to-end prove. +Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster. +``` + +With `--report-dir DIR` the script writes: + +- `results.tsv` — tab-separated raw data (`log_rows, rows, lambda_median_s, + p3_median_s, ratio_lambda_over_p3, runs`). 
+- `metrics.txt` — key=value pairs with the config used (arch, scalar flag, + extension degree, blowup, queries, runs, rustflags) and the per-series + values slash-joined (so post-processing scripts can split easily). +- `raw/` — per-invocation stdouts (`{prover}_log{K}_run{i}.stdout`). + +No markdown file is generated — the TSV is the single source of truth for +downstream tooling. + +## Nightly + +A GitHub Actions workflow (`.github/workflows/bench-vs-p3-nightly.yml`) runs +daily at 07:30 UTC (04:30 Buenos Aires, after the SP1 nightly completes) on +the self-hosted `bench` runner. It executes: + +```bash +bash ./bench_vs_plonky3/run.sh \ + --log-rows 19 \ + --num-sequences 16 \ + --runs 3 \ + --no-p3-patch \ + --scalar \ + --report-dir bench_vs_p3_artifacts \ + --no-color +``` + +The `bench_vs_p3_artifacts/` directory is uploaded as an artifact named +`bench-vs-p3-nightly--` with 90-day retention. + +## Notes on fairness + +- **Extension field**: default mode uses the vendored `p3-goldilocks-patched` + (`BinomiallyExtendable<3>`, same `x^3 - 2` as Lambda). `--no-p3-patch` falls + back to upstream degree-2 — Lambda still runs degree-3, so the sides differ. + The nightly runs in the degree-2 mode to track the "shipped P3 vs shipped + Lambda" comparison. +- **Parallelism**: both provers are multi-threaded by default. Lambda pulls + rayon via `stark/parallel`; Plonky3 pulls rayon via + `p3-uni-stark` / `p3-dft` (hardcoded `features = ["parallel"]`, always on). +- **SIMD**: without `--scalar`, each side uses whatever target-features the + compiler decides from the host CPU. `--scalar` equalises Goldilocks on + `x86_64` (no AVX2/AVX-512) or disables the ARMv8.4 SHA3 Keccak extension on + `aarch64`. `p3-keccak`'s SSE2 path on x86 is not disabled. +- **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both + sides. 
Security models differ (Lambda: Johnson-bound, ~108 bits; P3: + conjectured, ~192 bits) — the compute work is equivalent, the claimed + soundness is not. See `ANALYSIS_LOG.md` for the full fairness audit. From e72772b5542474633e420bbcf8bb8f9dfd74a951 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 19:46:23 -0300 Subject: [PATCH 20/34] Fix --no-p3-patch cleanup, --- bench_vs_plonky3/run.sh | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index 550a83a77..a0ace698d 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -127,12 +127,22 @@ fi # p3-goldilocks-patched (adds BinomiallyExtendable<3>, disables NEON). For the # nightly we build against vanilla crates.io p3-goldilocks — we comment the # block out and drop the `p3-degree3` feature. +# +# Both Cargo.toml AND Cargo.lock are backed up before the build: dropping the +# patch makes cargo re-resolve p3-goldilocks against crates.io, which rewrites +# Cargo.lock. The trap restores both so the working tree is clean on exit. CARGO_TOML="$ROOT_DIR/Cargo.toml" +CARGO_LOCK="$ROOT_DIR/Cargo.lock" CARGO_TOML_BAK="" +CARGO_LOCK_BAK="" BUILD_FEATURE_FLAGS=() if $NO_P3_PATCH; then CARGO_TOML_BAK="$CARGO_TOML.bak.p3bench.$$" cp "$CARGO_TOML" "$CARGO_TOML_BAK" + if [ -f "$CARGO_LOCK" ]; then + CARGO_LOCK_BAK="$CARGO_LOCK.bak.p3bench.$$" + cp "$CARGO_LOCK" "$CARGO_LOCK_BAK" + fi # Comment the [patch.crates-io] block and its entries (until the next blank # line or next [section]). 
python3 - "$CARGO_TOML" <<'PY' @@ -161,7 +171,7 @@ for ln in lines: out.append(ln) path.write_text("".join(out)) PY - trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi' EXIT INT TERM + trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi; if [ -n "$CARGO_LOCK_BAK" ] && [ -f "$CARGO_LOCK_BAK" ]; then mv "$CARGO_LOCK_BAK" "$CARGO_LOCK"; fi' EXIT INT TERM BUILD_FEATURE_FLAGS=(--no-default-features --features parallel) fi @@ -219,7 +229,13 @@ cargo build --release -p bench-vs-plonky3 --bin prove_bench \ --manifest-path "$ROOT_DIR/Cargo.toml" \ ${BUILD_FEATURE_FLAGS[@]+"${BUILD_FEATURE_FLAGS[@]}"} 2>&1 | tail -5 -BIN="$ROOT_DIR/target/release/prove_bench" +# Resolve the actual target directory via cargo metadata so we find the binary +# whether cargo used ./target/ (default) or a custom CARGO_TARGET_DIR. +TARGET_DIR=$(cargo metadata --manifest-path "$ROOT_DIR/Cargo.toml" \ + --format-version 1 --no-deps 2>/dev/null \ + | python3 -c 'import json, sys; print(json.load(sys.stdin)["target_directory"])' \ + 2>/dev/null || echo "$ROOT_DIR/target") +BIN="$TARGET_DIR/release/prove_bench" if [ ! -x "$BIN" ]; then echo -e "${RED}[build] prove_bench not produced at $BIN${NC}" exit 1 @@ -393,7 +409,18 @@ if [ -n "$REPORT_DIR" ]; then done } > "$REPORT_DIR/results.tsv" + # Capture commit + timestamp so the artifact is self-describing. + git_sha="$(git -C "$ROOT_DIR" rev-parse HEAD 2>/dev/null || echo unknown)" + git_dirty="clean" + if ! 
git -C "$ROOT_DIR" diff --quiet HEAD -- 2>/dev/null; then + git_dirty="dirty" + fi + timestamp_utc="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + { + echo "timestamp_utc=$timestamp_utc" + echo "git_sha=$git_sha" + echo "git_tree=$git_dirty" echo "arch=$(uname -m)" echo "num_sequences=$NUM_SEQUENCES" echo "columns=$((2 * NUM_SEQUENCES))" From ffefff4f9868a37b5f262d2854211ff48034d808 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Thu, 16 Apr 2026 20:16:05 -0300 Subject: [PATCH 21/34] Add breakdown section to README and match nightly size in instruments_breakdown --- bench_vs_plonky3/README.md | 42 +++++++++++++++++++++++++++++++++++++ bench_vs_plonky3/src/lib.rs | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md index 04260876f..727d6cce9 100644 --- a/bench_vs_plonky3/README.md +++ b/bench_vs_plonky3/README.md @@ -134,6 +134,48 @@ bash ./bench_vs_plonky3/run.sh \ The `bench_vs_p3_artifacts/` directory is uploaded as an artifact named `bench-vs-p3-nightly--` with 90-day retention. +## Breakdown (per-phase timing) for manual analysis + +The nightly only reports wall-clock totals. When you need to see *where* the +time goes (constraint eval vs FFT vs FRI vs Merkle vs queries on the Lambda +side, and the per-span breakdown on the Plonky3 side), run the +`instruments_breakdown` test: + +```bash +# x86_64 (server), Goldilocks scalar: +RUSTFLAGS="-C target-feature=-avx2,-avx512f" \ +cargo test -p bench-vs-plonky3 --features instruments --release -- \ + instruments_breakdown --nocapture + +# aarch64 (M1), 100% scalar: +RUSTFLAGS="-C target-feature=-sha3" \ +cargo test -p bench-vs-plonky3 --features instruments --release -- \ + instruments_breakdown --nocapture +``` + +- `--features instruments` activates `stark/instruments` — without it, the + per-phase timers are no-ops and the Lambda breakdown prints zeros. +- `--release` is mandatory (debug numbers are meaningless). 
+- `--nocapture` is required to see the output (`cargo test` swallows stdout + otherwise). +- The test hardcodes `num_sequences = 16`, `rows = 1 << 19` (524 288), same + shape as the nightly, so the breakdown maps onto the nightly numbers. +- Output is split in two sections: + - **Lambda**: explicit per-phase totals (Pre-pass / R1 Main commits / R1 Aux + build+commit / Rounds 2-4) plus sub-ops (Main LDE, Main Merkle, constraint + eval, decompose+extend, composition Merkle, OOD, deep comp, deep extend, + FRI commit, queries+open). + - **Plonky3**: every `tracing` span emitted at DEBUG during + `p3_uni_stark::prove`, sorted by wall-clock descending, filtered ≥ 0.1 ms. + Spans nest (e.g. `prove ⊃ compute_quotient_values`), so Σspans > total is + expected and not a bug. `(unaccounted)` can be negative from nesting. + +Details of every timer (which method it wraps, where it lives) are in +[`INSTRUMENTATION.md`](INSTRUMENTATION.md). + +The nightly does **not** activate this path — it would add ~1 % overhead and +pollute the historical wall-clock numbers. 
+ ## Notes on fairness - **Extension field**: default mode uses the vendored `p3-goldilocks-patched` diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs index 31e9ef470..d61c6ea9e 100644 --- a/bench_vs_plonky3/src/lib.rs +++ b/bench_vs_plonky3/src/lib.rs @@ -80,7 +80,7 @@ mod tests { #[test] fn instruments_breakdown() { let num_sequences = 16; - let rows = 1 << 18; + let rows = 1 << 19; let proof_options = benchmark_proof_options(); let initial_values: Vec<(FE, FE)> = (0..num_sequences) From e21518b09a6a7a87f943ddfc2ca4747df4374b08 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Sun, 19 Apr 2026 12:37:06 -0300 Subject: [PATCH 22/34] use fork --- .github/workflows/bench-vs-p3-nightly.yml | 1 - Cargo.lock | 231 +- Cargo.toml | 8 - bench_vs_plonky3/Cargo.toml | 44 +- bench_vs_plonky3/INSTRUMENTATION.md | 26 +- bench_vs_plonky3/README.md | 41 +- .../p3-goldilocks-patched/Cargo.toml | 129 - .../benches/bench_field.rs | 72 - .../benches/extension.rs | 40 - .../src/aarch64_neon/mds.rs | 343 --- .../src/aarch64_neon/mod.rs | 12 - .../src/aarch64_neon/packing.rs | 404 --- .../src/aarch64_neon/poseidon1.rs | 716 ----- .../src/aarch64_neon/poseidon1_asm.rs | 843 ------ .../src/aarch64_neon/poseidon2.rs | 652 ---- .../src/aarch64_neon/poseidon2_asm.rs | 2621 ----------------- .../src/aarch64_neon/utils.rs | 400 --- .../p3-goldilocks-patched/src/extension.rs | 217 -- .../p3-goldilocks-patched/src/goldilocks.rs | 813 ----- .../p3-goldilocks-patched/src/lib.rs | 42 - .../p3-goldilocks-patched/src/mds.rs | 761 ----- .../p3-goldilocks-patched/src/poseidon1.rs | 1143 ------- .../p3-goldilocks-patched/src/poseidon2.rs | 980 ------ .../src/x86_64_avx2/mds.rs | 86 - .../src/x86_64_avx2/mod.rs | 3 - .../src/x86_64_avx2/packing.rs | 539 ---- .../src/x86_64_avx512/mds.rs | 86 - .../src/x86_64_avx512/mod.rs | 3 - .../src/x86_64_avx512/packing.rs | 444 --- bench_vs_plonky3/run.sh | 112 +- bench_vs_plonky3/src/plonky3_config.rs | 17 +- 31 files changed, 171 
insertions(+), 11658 deletions(-) delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs delete mode 100644 bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml 
index f7856a139..d27bd9010 100644 --- a/.github/workflows/bench-vs-p3-nightly.yml +++ b/.github/workflows/bench-vs-p3-nightly.yml @@ -35,7 +35,6 @@ jobs: --log-rows 19 \ --num-sequences 16 \ --runs 3 \ - --no-p3-patch \ --scalar \ --report-dir bench_vs_p3_artifacts \ --no-color diff --git a/Cargo.lock b/Cargo.lock index ae5305254..98bdb17b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -303,14 +303,14 @@ dependencies = [ "p3-air", "p3-challenger", "p3-commit", - "p3-dft 0.5.2", - "p3-field 0.5.2", + "p3-dft 0.5.1", + "p3-field 0.5.1", "p3-fri", "p3-goldilocks", "p3-keccak", - "p3-matrix 0.5.2", + "p3-matrix 0.5.1", "p3-merkle-tree", - "p3-symmetric 0.5.2", + "p3-symmetric 0.5.1", "p3-uni-stark", "stark", "tracing", @@ -2256,12 +2256,11 @@ dependencies = [ [[package]] name = "p3-air" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f2ec9cbfc642fc5173817287c3f8b789d07743b5f7e812d058b7a03e344f9ab" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ - "p3-field 0.5.2", - "p3-matrix 0.5.2", + "p3-field 0.5.1", + "p3-matrix 0.5.1", "tracing", ] @@ -2282,30 +2281,28 @@ dependencies = [ [[package]] name = "p3-challenger" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a0b490c745a7d2adeeafff06411814c8078c432740162332b3cd71be0158a76" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ - "p3-field 0.5.2", - "p3-maybe-rayon 0.5.2", + "p3-field 0.5.1", + "p3-maybe-rayon 0.5.1", "p3-monty-31", - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "tracing", ] [[package]] name = "p3-commit" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"916ae7989d5c3b49f887f5c55b2f9826bdbb81aaebf834503c4145d8b267c829" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", "p3-challenger", - "p3-dft 0.5.2", - "p3-field 0.5.2", - "p3-matrix 0.5.2", - "p3-util 0.5.2", + "p3-dft 0.5.1", + "p3-field 0.5.1", + "p3-matrix 0.5.1", + "p3-util 0.5.1", "serde", ] @@ -2324,15 +2321,14 @@ dependencies = [ [[package]] name = "p3-dft" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55301e91544440254977108b85c32c09d7ea05f2f0dd61092a2825339906a4a7" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", - "p3-field 0.5.2", - "p3-matrix 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-util 0.5.2", + "p3-field 0.5.1", + "p3-matrix 0.5.1", + "p3-maybe-rayon 0.5.1", + "p3-util 0.5.1", "spin 0.10.0", "tracing", ] @@ -2353,14 +2349,13 @@ dependencies = [ [[package]] name = "p3-field" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85affca7fc983889f260655c4cf74163eebb94605f702e4b6809ead707cba54f" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", "num-bigint 0.4.6", - "p3-maybe-rayon 0.5.2", - "p3-util 0.5.2", + "p3-maybe-rayon 0.5.1", + "p3-util 0.5.1", "paste", "rand 0.10.1", "serde", @@ -2369,19 +2364,17 @@ dependencies = [ [[package]] name = "p3-fri" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ac25574ed306b4c9ad1969faaecc0fe6081d45ad7e1ec236661a6e0e37b39e1" +version = "0.5.1" +source = 
"git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", "p3-challenger", "p3-commit", - "p3-dft 0.5.2", - "p3-field 0.5.2", - "p3-interpolation", - "p3-matrix 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-util 0.5.2", + "p3-dft 0.5.1", + "p3-field 0.5.1", + "p3-matrix 0.5.1", + "p3-maybe-rayon 0.5.1", + "p3-util 0.5.1", "rand 0.10.1", "serde", "spin 0.10.0", @@ -2391,42 +2384,30 @@ dependencies = [ [[package]] name = "p3-goldilocks" -version = "0.5.2" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "num-bigint 0.4.6", "p3-challenger", - "p3-dft 0.5.2", - "p3-field 0.5.2", - "p3-mds 0.5.2", + "p3-dft 0.5.1", + "p3-field 0.5.1", + "p3-mds 0.5.1", "p3-poseidon1", - "p3-poseidon2 0.5.2", - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-poseidon2 0.5.1", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "paste", "rand 0.10.1", "serde", ] -[[package]] -name = "p3-interpolation" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14fd48db63ff15f5e96dc46e6991dbc2d39431b82dcb154bad90f4579236e328" -dependencies = [ - "p3-field 0.5.2", - "p3-matrix 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-util 0.5.2", -] - [[package]] name = "p3-keccak" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebcf27615ece1995e4fcf4c69740f1cf515d1481367a20b4b3ce7f4f1b8d70f7" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "tiny-keccak", ] @@ -2447,14 +2428,13 @@ dependencies = [ [[package]] name = "p3-matrix" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"53428126b009071563d1d07305a9de8be0d21de00b57d2475289ee32ffca6577" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", - "p3-field 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-util 0.5.2", + "p3-field 0.5.1", + "p3-maybe-rayon 0.5.1", + "p3-util 0.5.1", "rand 0.10.1", "serde", "tracing", @@ -2468,9 +2448,8 @@ checksum = "c3968ad1160310296eb04f91a5f4edfa38fe1d6b2b8cd6b5c64e6f9b7370979e" [[package]] name = "p3-maybe-rayon" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "082bf467011c06c768c579ec6eb9accb5e1e62108891634cc770396e917f978a" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "rayon", ] @@ -2492,30 +2471,28 @@ dependencies = [ [[package]] name = "p3-mds" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35209e6214102ea6ec6b8cb1b9c15a9b8e597a39f9173597c957f123bced81b3" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ - "p3-dft 0.5.2", - "p3-field 0.5.2", - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-dft 0.5.1", + "p3-field 0.5.1", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "rand 0.10.1", ] [[package]] name = "p3-merkle-tree" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "182a5383a54c50f47866f819946d28d95262f69967902734de8fdecb0d70c774" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", "p3-commit", - "p3-field 0.5.2", - "p3-matrix 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-field 
0.5.1", + "p3-matrix 0.5.1", + "p3-maybe-rayon 0.5.1", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "rand 0.10.1", "serde", "thiserror 2.0.17", @@ -2524,21 +2501,20 @@ dependencies = [ [[package]] name = "p3-monty-31" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffa8c99ec50c035020bbf5457c6a729ba6a975719c1a8dd3f16421081e4f650c" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", "num-bigint 0.4.6", - "p3-dft 0.5.2", - "p3-field 0.5.2", - "p3-matrix 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-mds 0.5.2", + "p3-dft 0.5.1", + "p3-field 0.5.1", + "p3-matrix 0.5.1", + "p3-maybe-rayon 0.5.1", + "p3-mds 0.5.1", "p3-poseidon1", - "p3-poseidon2 0.5.2", - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-poseidon2 0.5.1", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "paste", "rand 0.10.1", "serde", @@ -2548,12 +2524,11 @@ dependencies = [ [[package]] name = "p3-poseidon1" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a018b618e3fa0aec8be933b1d8e404edd23f46991f6bf3f5c2f3f95e9413fe9" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ - "p3-field 0.5.2", - "p3-symmetric 0.5.2", + "p3-field 0.5.1", + "p3-symmetric 0.5.1", "rand 0.10.1", ] @@ -2573,14 +2548,13 @@ dependencies = [ [[package]] name = "p3-poseidon2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256a668a9ba916f8767552f13d0ba50d18968bc74a623bfdafa41e2970c944d0" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ - "p3-field 0.5.2", - "p3-mds 0.5.2", - "p3-symmetric 0.5.2", - "p3-util 0.5.2", + "p3-field 0.5.1", + 
"p3-mds 0.5.1", + "p3-symmetric 0.5.1", + "p3-util 0.5.1", "rand 0.10.1", ] @@ -2597,30 +2571,28 @@ dependencies = [ [[package]] name = "p3-symmetric" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c60a71a1507c13611b0f2b0b6e83669fd5b76f8e3115bcbced5ccfdf3ca7807" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", - "p3-field 0.5.2", - "p3-util 0.5.2", + "p3-field 0.5.1", + "p3-util 0.5.1", "serde", ] [[package]] name = "p3-uni-stark" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c4ecaad8a7b4cf0fc711278c7a29fdc6d14239157866b17feaf14061834bc51" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "itertools 0.14.0", "p3-air", "p3-challenger", "p3-commit", - "p3-field 0.5.2", - "p3-matrix 0.5.2", - "p3-maybe-rayon 0.5.2", - "p3-util 0.5.2", + "p3-field 0.5.1", + "p3-matrix 0.5.1", + "p3-maybe-rayon 0.5.1", + "p3-util 0.5.1", "serde", "thiserror 2.0.17", "tracing", @@ -2637,9 +2609,8 @@ dependencies = [ [[package]] name = "p3-util" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8b766b9e9254bf3fa98d76e42cf8a5b30628c182dfd5272d270076ee12f0fc0" +version = "0.5.1" +source = "git+https://github.com/yetanotherco/Plonky3.git?branch=feat%2Fgoldilocks_deg3#2f3f99f1a2765c4945ca7e89ff2231d10b32399a" dependencies = [ "serde", "transpose", diff --git a/Cargo.toml b/Cargo.toml index 886c206f2..031606010 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,11 +19,3 @@ debug = true # For profiling with samply/perf, build with: # CARGO_PROFILE_RELEASE_DEBUG=1 cargo build --release - -# Patched p3-goldilocks adds a BinomiallyExtendable<3> impl for degree-3 -# extension (same as Lambda's x^3 - 
2) and disables NEON packing on aarch64. -# Used only by bench_vs_plonky3 for apples-to-apples comparisons against -# Lambda STARK. The nightly workflow comments this block out at CI time to -# benchmark vanilla p3-goldilocks (degree-2 extension). -[patch.crates-io] -p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" } diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index a3d4e02e2..5b313106f 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -9,19 +9,24 @@ stark = { path = "../crypto/stark", features = ["test-utils"] } crypto = { path = "../crypto/crypto", features = ["std", "serde"] } math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] } -# Plonky3 (all 0.5.2) -p3-air = "0.5.2" -p3-field = "0.5.2" -p3-goldilocks = "0.5.2" -p3-matrix = "0.5.2" -p3-commit = "0.5.2" -p3-challenger = "0.5.2" -p3-symmetric = "0.5.2" -p3-merkle-tree = "0.5.2" -p3-keccak = "0.5.2" -p3-fri = "0.5.2" -p3-uni-stark = { version = "0.5.2", features = ["parallel"] } -p3-dft = { version = "0.5.2", features = ["parallel"] } +# Plonky3: pinned to the yetanotherco fork, branch `feat/goldilocks_deg3`. +# The branch adds BinomiallyExtendable<3> for Goldilocks (x^3 - 2), matching +# Lambda's Degree3GoldilocksExtensionField. All p3-* crates MUST resolve to +# the same git source + ref; declaring any of them as a crates.io dep would +# pull in a second incompatible p3-field. cargo clones the fork once into +# ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time. 
+p3-air = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-field = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-goldilocks = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-matrix = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-commit = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-challenger = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-symmetric = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-keccak = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-fri = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-uni-stark = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] } +p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] } # Tracing for P3 span-based profiling tracing = "0.1" @@ -34,18 +39,11 @@ criterion = { version = "0.4", default-features = false } # Both provers run multi-threaded by default: Plonky3's `Radix2DitParallel` DFT # uses rayon unconditionally, so Lambda must also enable `parallel` for a fair # apples-to-apples comparison. Disable with `--no-default-features` to compare -# single-threaded. -# -# `p3-degree3` (default on) selects the cubic extension for Plonky3's -# Challenge type, matching Lambda's `Degree3GoldilocksExtensionField`. It -# requires the root `[patch.crates-io]` pointing at p3-goldilocks-patched. 
-# Disable it (`--no-default-features --features parallel`) together with -# commenting the patch block to build against vanilla crates.io -# p3-goldilocks (degree-2 extension). -default = ["parallel", "p3-degree3"] +# single-threaded. Cubic extension (`x^3 - 2`) matching Lambda is unconditional +# — the fork ships `BinomiallyExtendable<3>` for Goldilocks natively. +default = ["parallel"] parallel = ["stark/parallel"] instruments = ["stark/instruments"] -p3-degree3 = [] [[bin]] name = "prove_bench" diff --git a/bench_vs_plonky3/INSTRUMENTATION.md b/bench_vs_plonky3/INSTRUMENTATION.md index 0d82afe0e..b7b6bd4b1 100644 --- a/bench_vs_plonky3/INSTRUMENTATION.md +++ b/bench_vs_plonky3/INSTRUMENTATION.md @@ -10,14 +10,6 @@ El test que imprime el breakdown se llama `instruments_breakdown`. Hay que compilar con la feature `instruments` y pasar `--nocapture` porque la salida va a stdout (si no, `cargo test` se la come). -**M1 (100% scalar, fairest):** - -```bash -RUSTFLAGS="-C target-feature=-sha3" \ -cargo test -p bench-vs-plonky3 --features instruments --release -- \ - instruments_breakdown --nocapture -``` - **x86 (Goldilocks scalar, SSE2 Keccak residual en P3):** ```bash @@ -189,15 +181,9 @@ timings y aparecen en logs distintos. tiempo fuera de `multi_prove` (construcción de AIR, setup). 4. Los porcentajes de Plonky3 se calculan contra **`p3_prove_dur`** (solo el `prove`, sin setup). -5. El benchmark usa **degree 3** para la extensión de Plonky3 *sólo* si el - root `Cargo.toml` mantiene: - ```toml - [patch.crates-io] - p3-goldilocks = { path = "bench_vs_plonky3/p3-goldilocks-patched" } - ``` - (línea 26). Sin ese patch, P3 usa la extensión degree 2 de upstream y la - comparación deja de ser fair. -6. Plataforma: - - M1: `RUSTFLAGS="-C target-feature=-sha3"` → scalar en ambos lados. - - x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` → Goldilocks scalar, - residual SSE2 en Keccak de P3 (~7%). +5. 
El benchmark usa **degree 3** para la extensión de Plonky3 vía git deps a + la rama `feat/goldilocks_deg3` del fork `yetanotherco/Plonky3` (ver + `bench_vs_plonky3/Cargo.toml`), que provee `BinomiallyExtendable<3>` + para Goldilocks con el mismo irreducible `x^3 - 2` que Lambda. +6. Plataforma: x86 con `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` → + Goldilocks scalar, residual SSE2 en Keccak de P3 (~7%). diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md index 727d6cce9..fea3c8d7e 100644 --- a/bench_vs_plonky3/README.md +++ b/bench_vs_plonky3/README.md @@ -36,16 +36,16 @@ test. - Rust stable (the crate builds with `cargo build --release`). - No SP1 toolchain needed — there's no VM guest compilation. -- For `--no-p3-patch` mode: a network-reachable crates.io (the script pulls - vanilla `p3-goldilocks 0.5.2` on demand). -- For default mode (with the degree-3 patch): the vendored crate at - `bench_vs_plonky3/p3-goldilocks-patched/` and the root `[patch.crates-io]` - entry pointing at it. +- Read access to `https://github.com/yetanotherco/Plonky3.git` (branch + `feat/goldilocks_deg3`). Cargo clones it into `~/.cargo/git/db` on the + first build and `Cargo.lock` pins the SHA. The branch provides + `BinomiallyExtendable<3>` for Goldilocks (`x^3 - 2`, matching Lambda's + `Degree3GoldilocksExtensionField`). ## Usage ```bash -# Default: log-rows=19, num-sequences=16, runs=3, with degree-3 patch, no scalar +# Default: log-rows=19, num-sequences=16, runs=3, cubic extension, no scalar ./bench_vs_plonky3/run.sh # Size sweep @@ -55,8 +55,8 @@ test. 
./bench_vs_plonky3/run.sh --lambda-only ./bench_vs_plonky3/run.sh --p3-only -# Nightly-equivalent (vanilla P3 degree-2, scalar on both sides) -./bench_vs_plonky3/run.sh --no-p3-patch --scalar +# Scalar mode on both sides (x86_64 only — disables AVX2/AVX-512) +./bench_vs_plonky3/run.sh --scalar # Write machine-readable artifacts ./bench_vs_plonky3/run.sh --report-dir /tmp/p3_report --no-color @@ -71,8 +71,7 @@ test. | `--runs N` | `3` | Runs per `(size, prover)`; median is reported. | | `--lambda-only` / `--p3-only` | both | Restrict to a single prover. | | `--report-dir DIR` | — | Write TSV + metrics + raw stdouts. | -| `--no-p3-patch` | off | Comment the root `[patch.crates-io]` before building and restore on exit. Plonky3 compiles against vanilla crates.io `p3-goldilocks 0.5.2` (`BinomialExtensionField`). Lambda still runs degree 3 — the extension fields differ across sides but the AIRs stay identical. | -| `--scalar` | off | Pin `RUSTFLAGS` to disable SIMD on both sides. On `x86_64` drops AVX2 and AVX-512 (Goldilocks + most of Keccak go scalar, SSE2 residual on `p3-keccak`). On `aarch64` drops the `sha3` ISA extension (Keccak accelerator). | +| `--scalar` | off | Pin `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` so Goldilocks (and most of Keccak) run scalar on both sides. x86_64 only; on other archs the flag is ignored with a warning. Residual SSE2 on `p3-keccak` remains (~7% of total prove time). | | `--no-color` | off | Disable ANSI colors. | | `-h` / `--help` | — | Print usage. 
| @@ -85,7 +84,7 @@ Stdout (without `--report-dir`): log-rows: 19 num-sequences: 16 (columns = 32) runs/size: 3 (median reported) - p3 extension: degree 2 (vanilla, no patch) + p3 extension: degree 3 (forked p3-goldilocks, matches Lambda) scalar mode: on (arch=x86_64, RUSTFLAGS="-C target-feature=-avx2,-avx512f") [build] prove_bench @@ -125,7 +124,6 @@ bash ./bench_vs_plonky3/run.sh \ --log-rows 19 \ --num-sequences 16 \ --runs 3 \ - --no-p3-patch \ --scalar \ --report-dir bench_vs_p3_artifacts \ --no-color @@ -144,11 +142,6 @@ side, and the per-span breakdown on the Plonky3 side), run the ```bash # x86_64 (server), Goldilocks scalar: RUSTFLAGS="-C target-feature=-avx2,-avx512f" \ -cargo test -p bench-vs-plonky3 --features instruments --release -- \ - instruments_breakdown --nocapture - -# aarch64 (M1), 100% scalar: -RUSTFLAGS="-C target-feature=-sha3" \ cargo test -p bench-vs-plonky3 --features instruments --release -- \ instruments_breakdown --nocapture ``` @@ -178,18 +171,16 @@ pollute the historical wall-clock numbers. ## Notes on fairness -- **Extension field**: default mode uses the vendored `p3-goldilocks-patched` - (`BinomiallyExtendable<3>`, same `x^3 - 2` as Lambda). `--no-p3-patch` falls - back to upstream degree-2 — Lambda still runs degree-3, so the sides differ. - The nightly runs in the degree-2 mode to track the "shipped P3 vs shipped - Lambda" comparison. +- **Extension field**: Plonky3 runs `BinomialExtensionField<Goldilocks, 3>` + with the same `x^3 - 2` irreducible as Lambda's + `Degree3GoldilocksExtensionField`. Both sides use the same cubic extension. - **Parallelism**: both provers are multi-threaded by default. Lambda pulls rayon via `stark/parallel`; Plonky3 pulls rayon via `p3-uni-stark` / `p3-dft` (hardcoded `features = ["parallel"]`, always on). - **SIMD**: without `--scalar`, each side uses whatever target-features the - compiler decides from the host CPU.
`--scalar` equalises Goldilocks on - `x86_64` (no AVX2/AVX-512) or disables the ARMv8.4 SHA3 Keccak extension on - `aarch64`. `p3-keccak`'s SSE2 path on x86 is not disabled. + compiler decides from the host CPU. `--scalar` (x86_64 only) disables AVX2 + and AVX-512 so Goldilocks arithmetic is scalar on both sides. `p3-keccak`'s + SSE2 path on x86 is not disabled. - **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both sides. Security models differ (Lambda: Johnson-bound, ~108 bits; P3: conjectured, ~192 bits) — the compute work is equivalent, the claimed diff --git a/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml b/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml deleted file mode 100644 index 768a2bb5a..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/Cargo.toml +++ /dev/null @@ -1,129 +0,0 @@ -# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO -# -# When uploading crates to the registry Cargo will automatically -# "normalize" Cargo.toml files for maximal compatibility -# with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies. -# -# If you are reading this file be aware that the original Cargo.toml -# will likely look very different (and much more reasonable). -# See Cargo.toml.orig for the original contents. - -[package] -edition = "2024" -name = "p3-goldilocks" -version = "0.5.2" -build = false -autolib = false -autobins = false -autoexamples = false -autotests = false -autobenches = false -description = "An implementation of the Goldilocks prime field F_p, where p = 2^64 - 2^32 + 1." 
-homepage = "https://github.com/Plonky3/Plonky3" -readme = false -keywords = [ - "cryptography", - "SNARK", - "PLONK", - "FRI", - "plonky3", -] -categories = ["cryptography::cryptocurrencies"] -license = "MIT OR Apache-2.0" -repository = "https://github.com/Plonky3/Plonky3" -resolver = "2" - -[lib] -name = "p3_goldilocks" -path = "src/lib.rs" - -[[bench]] -name = "bench_field" -path = "benches/bench_field.rs" -harness = false - -[[bench]] -name = "extension" -path = "benches/extension.rs" -harness = false - -[dependencies.num-bigint] -version = "0.4.6" -default-features = false - -[dependencies.p3-challenger] -version = "0.5.2" - -[dependencies.p3-dft] -version = "0.5.2" - -[dependencies.p3-field] -version = "0.5.2" - -[dependencies.p3-mds] -version = "0.5.2" - -[dependencies.p3-poseidon1] -version = "0.5.2" - -[dependencies.p3-poseidon2] -version = "0.5.2" - -[dependencies.p3-symmetric] -version = "0.5.2" - -[dependencies.p3-util] -version = "0.5.2" - -[dependencies.paste] -version = "1.0.15" - -[dependencies.rand] -version = "0.10.0" -default-features = false - -[dependencies.serde] -version = "1.0" -features = ["derive"] -default-features = false - -[dev-dependencies.criterion] -version = "0.8" - -[dev-dependencies.proptest] -version = "1.10" - -[dev-dependencies.rand] -version = "0.10.0" -default-features = false - -[lints.clippy] -cognitive_complexity = "allow" -match_bool = "warn" -needless_pass_by_value = "warn" -redundant_pub_crate = "allow" -semicolon_if_nothing_returned = "warn" -too_long_first_doc_paragraph = "allow" -transmute_undefined_repr = "allow" -tuple_array_conversions = "allow" -unused_peekable = "allow" - -[lints.clippy.all] -level = "warn" -priority = -1 - -[lints.clippy.nursery] -level = "warn" -priority = -1 - -[lints.rust] -rust_2024_incompatible_pat = "warn" -unused_must_use = "deny" - -[lints.rust.rust_2018_idioms] -level = "deny" -priority = -1 - -[lints.rustdoc] -all = "warn" diff --git 
a/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs deleted file mode 100644 index a0d5e05f4..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/benches/bench_field.rs +++ /dev/null @@ -1,72 +0,0 @@ -use core::any::type_name; - -use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; -use p3_field::{Field, PrimeCharacteristicRing}; -use p3_field_testing::bench_func::{ - benchmark_add_latency, benchmark_add_throughput, benchmark_chunked_linear_combination, - benchmark_inv, benchmark_iter_sum, benchmark_sub_latency, benchmark_sub_throughput, -}; -use p3_field_testing::{ - benchmark_dot_array, benchmark_mul_latency, benchmark_mul_throughput, benchmark_sum_array, -}; -use p3_goldilocks::Goldilocks; -use rand::rngs::SmallRng; -use rand::{RngExt, SeedableRng}; - -type F = Goldilocks; - -fn bench_field(c: &mut Criterion) { - let name = "Goldilocks"; - const REPS: usize = 200; - benchmark_mul_latency::(c, name); - benchmark_mul_throughput::(c, name); - benchmark_inv::(c, name); - benchmark_iter_sum::(c, name); - benchmark_sum_array::(c, name); - - benchmark_dot_array::(c, name); - benchmark_dot_array::(c, name); - benchmark_dot_array::(c, name); - benchmark_dot_array::(c, name); - benchmark_dot_array::(c, name); - benchmark_dot_array::(c, name); - - // Note that each round of throughput has 10 operations - // So we should have 10 * more repetitions for latency tests. 
- const L_REPS: usize = 10 * REPS; - benchmark_add_latency::(c, name); - benchmark_add_throughput::(c, name); - benchmark_sub_latency::(c, name); - benchmark_sub_throughput::(c, name); - - benchmark_chunked_linear_combination::(c, name); - - let mut rng = SmallRng::seed_from_u64(1); - c.bench_function("7th_root", |b| { - b.iter_batched( - || rng.random::(), - |x| x.exp_u64(10540996611094048183), - BatchSize::SmallInput, - ); - }); -} -fn bench_packedfield(c: &mut Criterion) { - let name = type_name::<::Packing>().to_string(); - // Note that each round of throughput has 10 operations - // So we should have 10 * more repetitions for latency tests. - const REPS: usize = 100; - const L_REPS: usize = 10 * REPS; - - benchmark_add_latency::<::Packing, L_REPS>(c, &name); - benchmark_add_throughput::<::Packing, REPS>(c, &name); - benchmark_sub_latency::<::Packing, L_REPS>(c, &name); - benchmark_sub_throughput::<::Packing, REPS>(c, &name); - benchmark_mul_latency::<::Packing, L_REPS>(c, &name); - benchmark_mul_throughput::<::Packing, REPS>(c, &name); - - type PF = ::Packing; - benchmark_chunked_linear_combination::(c, &name); -} - -criterion_group!(goldilocks_arithmetic, bench_field, bench_packedfield); -criterion_main!(goldilocks_arithmetic); diff --git a/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs deleted file mode 100644 index f4bf7e750..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/benches/extension.rs +++ /dev/null @@ -1,40 +0,0 @@ -use criterion::{Criterion, criterion_group, criterion_main}; -use p3_field::extension::BinomialExtensionField; -use p3_field_testing::bench_func::{ - benchmark_inv, benchmark_mul_latency, benchmark_mul_throughput, benchmark_square, -}; -use p3_field_testing::benchmark_mul; -use p3_goldilocks::Goldilocks; - -type EF2 = BinomialExtensionField; -type EF5 = BinomialExtensionField; - -// Note that each round of throughput has 10 operations -// So we should have 
10 * more repetitions for latency tests. -const REPS: usize = 50; -const L_REPS: usize = 10 * REPS; - -fn bench_quadratic_extension(c: &mut Criterion) { - let name = "BinomialExtensionField"; - benchmark_square::(c, name); - benchmark_inv::(c, name); - benchmark_mul::(c, name); - benchmark_mul_throughput::(c, name); - benchmark_mul_latency::(c, name); -} - -fn bench_quintic_extension(c: &mut Criterion) { - let name = "BinomialExtensionField"; - benchmark_square::(c, name); - benchmark_inv::(c, name); - benchmark_mul::(c, name); - benchmark_mul_throughput::(c, name); - benchmark_mul_latency::(c, name); -} - -criterion_group!( - bench_goldilocks_ef, - bench_quadratic_extension, - bench_quintic_extension -); -criterion_main!(bench_goldilocks_ef); diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs deleted file mode 100644 index 9d4b410d3..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mds.rs +++ /dev/null @@ -1,343 +0,0 @@ -//! MDS permutation for Goldilocks on aarch64. - -use core::arch::aarch64::*; -use core::mem::transmute; - -use p3_mds::MdsPermutation; -use p3_symmetric::Permutation; - -use super::packing::PackedGoldilocksNeon; -use super::utils::{pack_lanes, unpack_lanes}; -use crate::{Goldilocks, MdsMatrixGoldilocks}; - -// --------------------------------------------------------------------------- -// Packed MdsMatrixGoldilocks (delegates to scalar Karatsuba per lane) -// --------------------------------------------------------------------------- - -/// Apply the scalar MDS to each lane of a packed NEON state independently. 
-#[inline] -fn mds_packed( - mds: &MdsMatrixGoldilocks, - input: &mut [PackedGoldilocksNeon; WIDTH], -) where - MdsMatrixGoldilocks: Permutation<[Goldilocks; WIDTH]>, -{ - let (mut lane0, mut lane1) = unpack_lanes(input); - unsafe { - mds.permute_mut(&mut *(&mut lane0 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH])); - mds.permute_mut(&mut *(&mut lane1 as *mut [u64; WIDTH] as *mut [Goldilocks; WIDTH])); - } - pack_lanes(input, &lane0, &lane1); -} - -impl Permutation<[PackedGoldilocksNeon; 8]> for MdsMatrixGoldilocks { - fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 8]) { - mds_packed(self, input); - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksNeon; 12]> for MdsMatrixGoldilocks { - fn permute_mut(&self, input: &mut [PackedGoldilocksNeon; 12]) { - mds_packed(self, input); - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -// --------------------------------------------------------------------------- -// NEON-accelerated circulant MDS (16-bit chunk multiply-accumulate) -// --------------------------------------------------------------------------- - -/// Goldilocks identity: `2^64 ≡ 2^32 − 1 (mod P)`. -const EPSILON_U32: u32 = 0xffffffff; - -/// Reduce two accumulated 4×32-bit chunk vectors back to Goldilocks field -/// elements. Each `uint32x4_t` holds four 32-bit accumulators representing -/// the four 16-bit chunks of a Goldilocks element: -/// -/// ```text -/// elem = c[0] + c[1]·2¹⁶ + c[2]·2³² + c[3]·2⁴⁸ -/// ``` -/// -/// Returns two Goldilocks values packed in a `uint64x2_t`. -/// -/// Ported from plonky2. 
-#[inline(always)] -unsafe fn mds_reduce([cumul_a, cumul_b]: [uint32x4_t; 2]) -> uint64x2_t { - unsafe { - let mut lo = vreinterpretq_u64_u32(vuzp1q_u32(cumul_a, cumul_b)); - let mut hi = vreinterpretq_u64_u32(vuzp2q_u32(cumul_a, cumul_b)); - - hi = vsraq_n_u64::<16>(hi, lo); - lo = vsliq_n_u64::<16>(lo, hi); - - let top = { - let hi_u8 = vreinterpretq_u8_u64(hi); - let top_idx = - transmute::<[u8; 8], uint8x8_t>([0x06, 0x07, 0xff, 0xff, 0x0e, 0x0f, 0xff, 0xff]); - let top_u8 = vqtbl1_u8(hi_u8, top_idx); - vreinterpret_u32_u8(top_u8) - }; - - let adj_lo = vmlal_n_u32(lo, top, EPSILON_U32); - let wraparound_mask = vcgtq_u64(lo, adj_lo); - vsraq_n_u64::<32>(adj_lo, wraparound_mask) - } -} - -/// NEON-accelerated width-8 circulant MDS. -/// -/// Circulant first row: `[7, 1, 3, 8, 8, 3, 4, 9]` -/// (matches `MATRIX_CIRC_MDS_8_SML_ROW`). -#[inline(always)] -pub unsafe fn mds_neon_w8(state: &[u64; 8]) -> [u64; 8] { - unsafe { - const ROW: [u32; 8] = [7, 1, 3, 8, 8, 3, 4, 9]; - - const M: [[u32; 8]; 8] = { - let mut m = [[0u32; 8]; 8]; - let mut i = 0; - while i < 8 { - let mut j = 0; - while j < 8 { - m[i][j] = ROW[(j + 8 - i) % 8]; - j += 1; - } - i += 1; - } - m - }; - - let c: [uint32x4_t; 8] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i]))); - - let mut res = [0u64; 8]; - - let mut pair = 0; - while pair < 4 { - let i0 = 2 * pair; - let i1 = i0 + 1; - - let mut a0 = vdupq_n_u32(0); - let mut a1 = vdupq_n_u32(0); - - let mut j = 0; - while j < 8 { - a0 = vmlaq_n_u32(a0, c[j], M[i0][j]); - a1 = vmlaq_n_u32(a1, c[j], M[i1][j]); - j += 1; - } - - let r = mds_reduce([a0, a1]); - res[i0] = vgetq_lane_u64::<0>(r); - res[i1] = vgetq_lane_u64::<1>(r); - pair += 1; - } - - res - } -} - -/// NEON-accelerated width-12 circulant MDS. -/// -/// Circulant first row: `[1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]` -/// (matches `MATRIX_CIRC_MDS_12_SML_ROW`). 
-#[inline(always)] -pub unsafe fn mds_neon_w12(state: &[u64; 12]) -> [u64; 12] { - unsafe { - const ROW: [u32; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]; - - const M: [[u32; 12]; 12] = { - let mut m = [[0u32; 12]; 12]; - let mut i = 0; - while i < 12 { - let mut j = 0; - while j < 12 { - m[i][j] = ROW[(j + 12 - i) % 12]; - j += 1; - } - i += 1; - } - m - }; - - let c: [uint32x4_t; 12] = core::array::from_fn(|i| vmovl_u16(vcreate_u16(state[i]))); - - let mut res = [0u64; 12]; - - let mut pair = 0; - while pair < 6 { - let i0 = 2 * pair; - let i1 = i0 + 1; - - let mut a0 = vdupq_n_u32(0); - let mut a1 = vdupq_n_u32(0); - - let mut j = 0; - while j < 12 { - a0 = vmlaq_n_u32(a0, c[j], M[i0][j]); - a1 = vmlaq_n_u32(a1, c[j], M[i1][j]); - j += 1; - } - - let r = mds_reduce([a0, a1]); - res[i0] = vgetq_lane_u64::<0>(r); - res[i1] = vgetq_lane_u64::<1>(r); - pair += 1; - } - - res - } -} - -/// NEON-accelerated MDS wrapper for use with the generic Poseidon1. -/// -/// Zero-sized type that implements `Permutation<[Goldilocks; 8]>` and -/// `Permutation<[Goldilocks; 12]>` using the NEON chunk technique. Plugs -/// into `Poseidon1ExternalLayerGeneric` to accelerate full-round MDS while -/// keeping LLVM-optimized partial rounds from the generic Poseidon1. 
-#[derive(Clone, Debug, Default)] -pub struct MdsNeonGoldilocks; - -impl Permutation<[Goldilocks; 8]> for MdsNeonGoldilocks { - fn permute_mut(&self, state: &mut [Goldilocks; 8]) { - let raw = unsafe { &*(state as *const [Goldilocks; 8] as *const [u64; 8]) }; - let result = unsafe { mds_neon_w8(raw) }; - *unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) } = result; - } -} - -impl Permutation<[Goldilocks; 12]> for MdsNeonGoldilocks { - fn permute_mut(&self, state: &mut [Goldilocks; 12]) { - let raw = unsafe { &*(state as *const [Goldilocks; 12] as *const [u64; 12]) }; - let result = unsafe { mds_neon_w12(raw) }; - *unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) } = result; - } -} - -#[cfg(test)] -mod tests { - use p3_field::PrimeField64; - use p3_symmetric::Permutation; - use rand::rngs::SmallRng; - use rand::{RngExt, SeedableRng}; - - use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksNeon}; - - type F = Goldilocks; - - // -- Packed MdsMatrixGoldilocks tests -- - - macro_rules! 
test_neon_mds { - ($name:ident, $width:literal) => { - #[test] - fn $name() { - let mut rng = SmallRng::seed_from_u64(1); - let mds = MdsMatrixGoldilocks; - - let input: [Goldilocks; $width] = rng.random(); - let expected = mds.permute(input); - - let packed_input = input.map(Into::::into); - let packed_output = mds.permute(packed_input); - - let neon_output = packed_output.map(|x| x.0[0]); - assert_eq!(neon_output, expected); - } - }; - } - - test_neon_mds!(test_neon_mds_width_8, 8); - test_neon_mds!(test_neon_mds_width_12, 12); - - // -- NEON MDS correctness tests -- - - #[test] - fn test_mds_neon_w8_matches_karatsuba() { - let mds = MdsMatrixGoldilocks; - let mut rng = SmallRng::seed_from_u64(42); - - for _ in 0..100 { - let input: [F; 8] = rng.random(); - let expected = mds.permute(input); - - let raw: [u64; 8] = input.map(|x| x.as_canonical_u64()); - let result = unsafe { super::mds_neon_w8(&raw) }; - - for i in 0..8 { - assert_eq!( - F::new(result[i]).as_canonical_u64(), - expected[i].as_canonical_u64(), - "NEON MDS w8 mismatch at index {i}" - ); - } - } - } - - #[test] - fn test_mds_neon_w12_matches_karatsuba() { - let mds = MdsMatrixGoldilocks; - let mut rng = SmallRng::seed_from_u64(43); - - for _ in 0..100 { - let input: [F; 12] = rng.random(); - let expected = mds.permute(input); - - let raw: [u64; 12] = input.map(|x| x.as_canonical_u64()); - let result = unsafe { super::mds_neon_w12(&raw) }; - - for i in 0..12 { - assert_eq!( - F::new(result[i]).as_canonical_u64(), - expected[i].as_canonical_u64(), - "NEON MDS w12 mismatch at index {i}" - ); - } - } - } - - #[test] - fn test_mds_neon_boundary_w8() { - let mds = MdsMatrixGoldilocks; - let p_minus_1 = F::ORDER_U64 - 1; - - for &val in &[0u64, 1, p_minus_1] { - let input: [F; 8] = [F::new(val); 8]; - let expected = mds.permute(input); - - let raw = [val; 8]; - let result = unsafe { super::mds_neon_w8(&raw) }; - - for i in 0..8 { - assert_eq!( - F::new(result[i]).as_canonical_u64(), - 
expected[i].as_canonical_u64(), - "NEON MDS w8 boundary mismatch at index {i} for value {val}" - ); - } - } - } - - #[test] - fn test_mds_neon_boundary_w12() { - let mds = MdsMatrixGoldilocks; - let p_minus_1 = F::ORDER_U64 - 1; - - for &val in &[0u64, 1, p_minus_1] { - let input: [F; 12] = [F::new(val); 12]; - let expected = mds.permute(input); - - let raw = [val; 12]; - let result = unsafe { super::mds_neon_w12(&raw) }; - - for i in 0..12 { - assert_eq!( - F::new(result[i]).as_canonical_u64(), - expected[i].as_canonical_u64(), - "NEON MDS w12 boundary mismatch at index {i} for value {val}" - ); - } - } - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs deleted file mode 100644 index 82516a6cf..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/mod.rs +++ /dev/null @@ -1,12 +0,0 @@ -mod mds; -mod packing; -mod poseidon1; -mod poseidon1_asm; -mod poseidon2; -mod poseidon2_asm; -mod utils; - -pub use mds::MdsNeonGoldilocks; -pub use packing::*; -pub use poseidon1::*; -pub use poseidon2::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs deleted file mode 100644 index f393c3b65..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/packing.rs +++ /dev/null @@ -1,404 +0,0 @@ -use alloc::vec::Vec; -use core::arch::aarch64::{ - uint64x2_t, vaddq_u64, vandq_u64, vbicq_u64, vcgtq_s64, vdupq_n_u64, veorq_u64, vgetq_lane_u64, - vreinterpretq_s64_u64, vsetq_lane_u64, vshrq_n_u64, vsubq_u64, -}; -use core::fmt::Debug; -use core::iter::{Product, Sum}; -use core::mem::transmute; -use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; - -use p3_field::exponentiation::exp_10540996611094048183; -use p3_field::op_assign_macros::{ - impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, 
impl_mul_methods, - impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, - ring_sum, -}; -use p3_field::{ - Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, - PermutationMonomial, PrimeCharacteristicRing, PrimeField64, -}; -use p3_util::reconstitute_from_base; -use rand::distr::{Distribution, StandardUniform}; -use rand::{Rng, RngExt}; - -use crate::{Goldilocks, P}; - -const WIDTH: usize = 2; - -/// Equal to `2^32 - 1 = 2^64 mod P`. -const EPSILON: u64 = Goldilocks::ORDER_U64.wrapping_neg(); - -/// Vectorized NEON implementation of `Goldilocks` arithmetic. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] -#[repr(transparent)] -#[must_use] -pub struct PackedGoldilocksNeon(pub [Goldilocks; WIDTH]); - -impl PackedGoldilocksNeon { - #[inline] - #[must_use] - pub(crate) fn to_vector(self) -> uint64x2_t { - unsafe { transmute(self) } - } - - #[inline] - pub(crate) fn from_vector(vector: uint64x2_t) -> Self { - unsafe { transmute(vector) } - } - - #[inline] - const fn broadcast(value: Goldilocks) -> Self { - Self([value; WIDTH]) - } -} - -impl From for PackedGoldilocksNeon { - fn from(x: Goldilocks) -> Self { - Self::broadcast(x) - } -} - -impl Add for PackedGoldilocksNeon { - type Output = Self; - #[inline] - fn add(self, rhs: Self) -> Self { - Self::from_vector(add(self.to_vector(), rhs.to_vector())) - } -} - -impl Sub for PackedGoldilocksNeon { - type Output = Self; - #[inline] - fn sub(self, rhs: Self) -> Self { - Self::from_vector(sub(self.to_vector(), rhs.to_vector())) - } -} - -impl Neg for PackedGoldilocksNeon { - type Output = Self; - #[inline] - fn neg(self) -> Self { - Self::from_vector(neg(self.to_vector())) - } -} - -impl Mul for PackedGoldilocksNeon { - type Output = Self; - #[inline] - fn mul(self, rhs: Self) -> Self { - Self::from_vector(mul(self.to_vector(), rhs.to_vector())) - } -} - -impl_add_assign!(PackedGoldilocksNeon); -impl_sub_assign!(PackedGoldilocksNeon); 
-impl_mul_methods!(PackedGoldilocksNeon); -ring_sum!(PackedGoldilocksNeon); -impl_rng!(PackedGoldilocksNeon); - -impl PrimeCharacteristicRing for PackedGoldilocksNeon { - type PrimeSubfield = Goldilocks; - - const ZERO: Self = Self::broadcast(Goldilocks::ZERO); - const ONE: Self = Self::broadcast(Goldilocks::ONE); - const TWO: Self = Self::broadcast(Goldilocks::TWO); - const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE); - - #[inline] - fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { - f.into() - } - - #[inline] - fn halve(&self) -> Self { - Self::from_vector(halve(self.to_vector())) - } - - #[inline] - fn square(&self) -> Self { - Self::from_vector(square(self.to_vector())) - } - - #[inline] - fn zero_vec(len: usize) -> Vec { - unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) } - } -} - -impl InjectiveMonomial<7> for PackedGoldilocksNeon {} - -impl PermutationMonomial<7> for PackedGoldilocksNeon { - fn injective_exp_root_n(&self) -> Self { - exp_10540996611094048183(*self) - } -} - -impl_add_base_field!(PackedGoldilocksNeon, Goldilocks); -impl_sub_base_field!(PackedGoldilocksNeon, Goldilocks); -impl_mul_base_field!(PackedGoldilocksNeon, Goldilocks); -impl_div_methods!(PackedGoldilocksNeon, Goldilocks); -impl_sum_prod_base_field!(PackedGoldilocksNeon, Goldilocks); - -impl Algebra for PackedGoldilocksNeon { - // Benchmarked on AArch64 NEON: chunk=2 ≈ 182ns, chunk=4 ≈ 198ns, chunk=8 ≈ 221ns. - const BATCHED_LC_CHUNK: usize = 2; -} - -impl_packed_value!(PackedGoldilocksNeon, Goldilocks, WIDTH); - -unsafe impl PackedField for PackedGoldilocksNeon { - type Scalar = Goldilocks; -} - -/// Interleave two 64-bit vectors at the element level. 
-/// For block_len=1: [a0, a1] x [b0, b1] -> [a0, b0], [a1, b1] -#[inline] -pub fn interleave_u64(v0: uint64x2_t, v1: uint64x2_t) -> (uint64x2_t, uint64x2_t) { - unsafe { - let a0 = vgetq_lane_u64::<0>(v0); - let a1 = vgetq_lane_u64::<1>(v0); - let b0 = vgetq_lane_u64::<0>(v1); - let b1 = vgetq_lane_u64::<1>(v1); - - // r0 = [a0, b0], r1 = [a1, b1] - let r0 = vsetq_lane_u64::<1>(b0, vsetq_lane_u64::<0>(a0, vdupq_n_u64(0))); - let r1 = vsetq_lane_u64::<1>(b1, vsetq_lane_u64::<0>(a1, vdupq_n_u64(0))); - - (r0, r1) - } -} - -unsafe impl PackedFieldPow2 for PackedGoldilocksNeon { - fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { - let (v0, v1) = (self.to_vector(), other.to_vector()); - let (res0, res1) = match block_len { - 1 => interleave_u64(v0, v1), - 2 => (v0, v1), - _ => panic!("unsupported block length"), - }; - (Self::from_vector(res0), Self::from_vector(res1)) - } -} - -// NEON arithmetic uses shifted representation (XOR with 2^63) for unsigned comparison. - -const SIGN_BIT: uint64x2_t = unsafe { transmute([i64::MIN as u64; WIDTH]) }; -const SHIFTED_FIELD_ORDER: uint64x2_t = - unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) }; -const EPSILON_VEC: uint64x2_t = unsafe { transmute([EPSILON; WIDTH]) }; - -#[inline(always)] -fn shift(x: uint64x2_t) -> uint64x2_t { - unsafe { veorq_u64(x, SIGN_BIT) } -} - -#[inline(always)] -unsafe fn canonicalize_s(x_s: uint64x2_t) -> uint64x2_t { - unsafe { - let x_s_signed = vreinterpretq_s64_u64(x_s); - let order_s_signed = vreinterpretq_s64_u64(SHIFTED_FIELD_ORDER); - let mask = vcgtq_s64(order_s_signed, x_s_signed); - let wrapback_amt = vbicq_u64(EPSILON_VEC, mask); - vaddq_u64(x_s, wrapback_amt) - } -} - -#[inline(always)] -unsafe fn add_no_double_overflow_64_64s_s(x: uint64x2_t, y_s: uint64x2_t) -> uint64x2_t { - unsafe { - let res_wrapped_s = vaddq_u64(x, y_s); - // After XOR shift, signed comparison correctly detects overflow. 
- // Overflow occurred iff y_s > res_wrapped_s (as signed, due to shift semantics) - let y_s_signed = vreinterpretq_s64_u64(y_s); - let res_s_signed = vreinterpretq_s64_u64(res_wrapped_s); - let mask = vcgtq_s64(y_s_signed, res_s_signed); - // wrapback_amt is EPSILON on overflow - let wrapback_amt = vshrq_n_u64::<32>(mask); - vaddq_u64(res_wrapped_s, wrapback_amt) - } -} - -/// Goldilocks modular addition. -#[inline] -fn add(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t { - unsafe { - let y_s = shift(y); - let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); - shift(res_s) - } -} - -/// Goldilocks modular subtraction. -#[inline] -fn sub(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t { - unsafe { - let mut y_s = shift(y); - y_s = canonicalize_s(y_s); - let x_s = shift(x); - let y_s_signed = vreinterpretq_s64_u64(y_s); - let x_s_signed = vreinterpretq_s64_u64(x_s); - // -1 if underflow (y > x) - let mask = vcgtq_s64(y_s_signed, x_s_signed); - let wrapback_amt = vshrq_n_u64::<32>(mask); - let res_wrapped = vsubq_u64(x_s, y_s); - vsubq_u64(res_wrapped, wrapback_amt) - } -} - -/// Goldilocks modular negation. -#[inline] -fn neg(y: uint64x2_t) -> uint64x2_t { - unsafe { - let y_s = shift(y); - vsubq_u64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) - } -} - -/// Halve a vector of Goldilocks field elements. -#[inline(always)] -pub(crate) fn halve(input: uint64x2_t) -> uint64x2_t { - unsafe { - let one = vdupq_n_u64(1); - let zero = vdupq_n_u64(0); - let half = vdupq_n_u64(P.div_ceil(2)); - - let least_bit = vandq_u64(input, one); - let t = vshrq_n_u64::<1>(input); - // neg_least_bit is 0 or -1 (all bits 1) - let neg_least_bit = vsubq_u64(zero, least_bit); - let maybe_half = vandq_u64(half, neg_least_bit); - vaddq_u64(t, maybe_half) - } -} - -/// Goldilocks modular multiplication using interleaved dual-lane ASM. 
-#[inline] -fn mul(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t { - unsafe { - let x0 = vgetq_lane_u64::<0>(x); - let x1 = vgetq_lane_u64::<1>(x); - let y0 = vgetq_lane_u64::<0>(y); - let y1 = vgetq_lane_u64::<1>(y); - - let (res_0, res_1) = mul_reduce_dual_asm(x0, y0, x1, y1); - - transmute([res_0, res_1]) - } -} - -/// Interleaved dual-lane multiplication and reduction using scalar ASM. -/// Uses shift-based EPSILON multiplication: hi_lo * EPSILON = (hi_lo << 32) - hi_lo -#[inline(always)] -unsafe fn mul_reduce_dual_asm(a0: u64, b0: u64, a1: u64, b1: u64) -> (u64, u64) { - use core::arch::asm; - let result0: u64; - let result1: u64; - - unsafe { - asm!( - // Compute both 128-bit products (interleaved for ILP) - "mul {lo0}, {a0}, {b0}", - "mul {lo1}, {a1}, {b1}", - "umulh {hi0}, {a0}, {b0}", - "umulh {hi1}, {a1}, {b1}", - - // hi_hi = hi >> 32 - "lsr {hi_hi0}, {hi0}, #32", - "lsr {hi_hi1}, {hi1}, #32", - - // tmp = lo - hi_hi (with borrow handling) - "subs {tmp0}, {lo0}, {hi_hi0}", - "csetm {adj0:w}, cc", - "subs {tmp1}, {lo1}, {hi_hi1}", - "csetm {adj1:w}, cc", - "sub {tmp0}, {tmp0}, {adj0}", - "sub {tmp1}, {tmp1}, {adj1}", - - // hi_lo = hi & EPSILON - "and {hi_lo0}, {hi0}, {epsilon}", - "and {hi_lo1}, {hi1}, {epsilon}", - - // hi_lo_eps = (hi_lo << 32) - hi_lo (avoids multiply) - "lsl {t0}, {hi_lo0}, #32", - "lsl {t1}, {hi_lo1}, #32", - "sub {hi_lo_eps0}, {t0}, {hi_lo0}", - "sub {hi_lo_eps1}, {t1}, {hi_lo1}", - - // result = tmp + hi_lo_eps (with overflow handling) - "adds {result0}, {tmp0}, {hi_lo_eps0}", - "csetm {adj0:w}, cs", - "adds {result1}, {tmp1}, {hi_lo_eps1}", - "csetm {adj1:w}, cs", - "add {result0}, {result0}, {adj0}", - "add {result1}, {result1}, {adj1}", - - a0 = in(reg) a0, - b0 = in(reg) b0, - a1 = in(reg) a1, - b1 = in(reg) b1, - epsilon = in(reg) EPSILON, - lo0 = out(reg) _, - lo1 = out(reg) _, - hi0 = out(reg) _, - hi1 = out(reg) _, - hi_hi0 = out(reg) _, - hi_hi1 = out(reg) _, - tmp0 = out(reg) _, - tmp1 = out(reg) _, - hi_lo0 = out(reg) _, 
- hi_lo1 = out(reg) _, - t0 = out(reg) _, - t1 = out(reg) _, - hi_lo_eps0 = out(reg) _, - hi_lo_eps1 = out(reg) _, - adj0 = out(reg) _, - adj1 = out(reg) _, - result0 = out(reg) result0, - result1 = out(reg) result1, - options(pure, nomem, nostack), - ); - } - - (result0, result1) -} - -/// Goldilocks modular square using interleaved dual-lane ASM. -#[inline] -fn square(x: uint64x2_t) -> uint64x2_t { - unsafe { - let x0 = vgetq_lane_u64::<0>(x); - let x1 = vgetq_lane_u64::<1>(x); - - let (res_0, res_1) = mul_reduce_dual_asm(x0, x0, x1, x1); - - transmute([res_0, res_1]) - } -} - -#[cfg(test)] -mod tests { - use p3_field_testing::test_packed_field; - - use super::{Goldilocks, PackedGoldilocksNeon, WIDTH}; - - const SPECIAL_VALS: [Goldilocks; WIDTH] = - Goldilocks::new_array([0xFFFF_FFFF_0000_0000, 0xFFFF_FFFF_FFFF_FFFF]); - - const ZEROS: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([ - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, // = P, canonicalizes to 0 - ])); - - const ONES: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([ - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, // = P + 1, canonicalizes to 1 - ])); - - test_packed_field!( - crate::PackedGoldilocksNeon, - &[super::ZEROS], - &[super::ONES], - crate::PackedGoldilocksNeon(super::SPECIAL_VALS) - ); -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs deleted file mode 100644 index 0a877578a..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1.rs +++ /dev/null @@ -1,716 +0,0 @@ -//! Fused Poseidon1 permutation for Goldilocks on aarch64. 
- -use alloc::vec::Vec; - -use p3_poseidon1::{ - FullRoundConstants, PartialRoundConstants, full_round_initial_permute_state, - full_round_terminal_permute_state, partial_permute_state, -}; -use p3_symmetric::{CryptographicPermutation, Permutation}; - -use super::mds::{MdsNeonGoldilocks, mds_neon_w8, mds_neon_w12}; -use super::packing::PackedGoldilocksNeon; -use super::poseidon1_asm::*; -use super::poseidon2_asm::{sbox_layer_asm, sbox_layer_dual_asm}; -use super::utils::{pack_lanes, unpack_lanes}; -use crate::Goldilocks; - -/// Fused Poseidon1 permutation for Goldilocks. -/// -/// Holds the pre-extracted raw `u64` constants from the optimized Poseidon1 -/// sparse-matrix decomposition. Storing raw values avoids field-element -/// overhead in the hot inner loop. -#[derive(Clone, Debug)] -pub struct Poseidon1GoldilocksFused { - /// Round constants for the initial full rounds (RF/2 vectors). - initial_constants_raw: Vec<[u64; WIDTH]>, - /// Round constants for the terminal full rounds (RF/2 vectors). - terminal_constants_raw: Vec<[u64; WIDTH]>, - /// Full-width constant vector for the first partial round. - first_round_constants_raw: [u64; WIDTH], - /// Dense transition matrix applied once before entering the partial-round loop. - m_i_raw: [[u64; WIDTH]; WIDTH], - /// Per-round first row of the sparse matrix (one per partial round). - sparse_first_row_raw: Vec<[u64; WIDTH]>, - /// Per-round sub-diagonal vector for the sparse matmul (one per partial round). - v_raw: Vec<[u64; WIDTH]>, - /// Scalar round constants for partial rounds 0 through RP-2. - /// - /// The last partial round has no scalar constant (it ends with the S-box only). - round_constants_raw: Vec, -} - -impl Poseidon1GoldilocksFused { - /// Create from pre-computed full and partial round constants. - /// - /// Extracts the raw `u64` representation from each Goldilocks field - /// element, building the flat arrays that the ASM kernels consume. 
- pub fn new( - full: &FullRoundConstants, - partial: &PartialRoundConstants, - ) -> Self { - // Extract raw u64 values from full-round constant matrices. - let initial_constants_raw = full - .initial - .iter() - .map(|rc| core::array::from_fn(|i| rc[i].value)) - .collect(); - let terminal_constants_raw = full - .terminal - .iter() - .map(|rc| core::array::from_fn(|i| rc[i].value)) - .collect(); - - // Extract the first partial-round constant vector. - let first_round_constants_raw = - core::array::from_fn(|i| partial.first_round_constants[i].value); - - // Extract the dense transition matrix. - let m_i_raw = core::array::from_fn(|i| core::array::from_fn(|j| partial.m_i[i][j].value)); - - // Extract per-round sparse matrix data. - let sparse_first_row_raw = partial - .sparse_first_row - .iter() - .map(|r| core::array::from_fn(|i| r[i].value)) - .collect(); - let v_raw = partial - .v - .iter() - .map(|r| core::array::from_fn(|i| r[i].value)) - .collect(); - - // Extract scalar round constants for partial rounds. - let round_constants_raw = partial.round_constants.iter().map(|c| c.value).collect(); - - Self { - initial_constants_raw, - terminal_constants_raw, - first_round_constants_raw, - m_i_raw, - sparse_first_row_raw, - v_raw, - round_constants_raw, - } - } -} - -/// Run the initial or terminal full rounds on a raw width-8 state. -/// -/// Each full round applies: add constants, S-box on all elements, NEON MDS. -#[inline] -fn full_rounds_scalar_w8(raw: &mut [u64; 8], constants: &[[u64; 8]]) { - for rc in constants { - unsafe { - add_rc_asm(raw, rc); - sbox_layer_asm(raw); - } - *raw = unsafe { mds_neon_w8(raw) }; - } -} - -/// Run the initial or terminal full rounds on a raw width-12 state. -/// -/// Each full round applies: add constants, S-box on all elements, NEON MDS. 
-#[inline] -fn full_rounds_scalar_w12(raw: &mut [u64; 12], constants: &[[u64; 12]]) { - for rc in constants { - unsafe { - add_rc_asm(raw, rc); - sbox_layer_asm(raw); - } - *raw = unsafe { mds_neon_w12(raw) }; - } -} - -/// Run all partial rounds on a raw width-8 state. -/// -/// The partial-round sequence is: -/// 1. Add the first-round full-width constant vector. -/// 2. Apply the dense transition matrix once. -/// 3. For each partial round (except the last): -/// S-box on first element, add scalar constant, sparse matmul. -/// 4. Last partial round: S-box on first element, sparse matmul (no constant). -#[inline] -fn partial_rounds_scalar_w8( - raw: &mut [u64; 8], - first_rc: &[u64; 8], - m_i: &[[u64; 8]; 8], - sparse_first_row: &[[u64; 8]], - v: &[[u64; 8]], - round_constants: &[u64], -) { - // Add the first-round full-width constant vector. - unsafe { - add_rc_asm(raw, first_rc); - } - - // Apply the dense transition matrix once. - dense_matmul_asm_w8(raw, m_i); - - // Main partial-round loop: S-box + scalar constant + sparse matmul. - let rounds_p = sparse_first_row.len(); - for r in 0..rounds_p - 1 { - unsafe { - sbox_s0_asm(raw); - add_scalar_s0_asm(raw, round_constants[r]); - cheap_matmul_asm_w8(raw, &sparse_first_row[r], &v[r]); - } - } - - // Last partial round: no scalar constant. - unsafe { - sbox_s0_asm(raw); - cheap_matmul_asm_w8(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]); - } -} - -/// Run all partial rounds on a raw width-12 state. -/// -/// Same structure as the width-8 variant. 
-#[inline] -fn partial_rounds_scalar_w12( - raw: &mut [u64; 12], - first_rc: &[u64; 12], - m_i: &[[u64; 12]; 12], - sparse_first_row: &[[u64; 12]], - v: &[[u64; 12]], - round_constants: &[u64], -) { - unsafe { - add_rc_asm(raw, first_rc); - } - dense_matmul_asm_w12(raw, m_i); - - let rounds_p = sparse_first_row.len(); - for r in 0..rounds_p - 1 { - unsafe { - sbox_s0_asm(raw); - add_scalar_s0_asm(raw, round_constants[r]); - cheap_matmul_asm_w12(raw, &sparse_first_row[r], &v[r]); - } - } - unsafe { - sbox_s0_asm(raw); - cheap_matmul_asm_w12(raw, &sparse_first_row[rounds_p - 1], &v[rounds_p - 1]); - } -} - -/// Run the initial or terminal full rounds on two raw width-8 lanes. -/// -/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane. -#[inline] -fn full_rounds_dual_w8(lane0: &mut [u64; 8], lane1: &mut [u64; 8], constants: &[[u64; 8]]) { - for rc in constants { - unsafe { - add_rc_dual_asm(lane0, lane1, rc); - sbox_layer_dual_asm(lane0, lane1); - } - *lane0 = unsafe { mds_neon_w8(lane0) }; - *lane1 = unsafe { mds_neon_w8(lane1) }; - } -} - -/// Run the initial or terminal full rounds on two raw width-12 lanes. -/// -/// Uses dual-lane ASM primitives for add_rc and S-box, then NEON MDS per lane. -#[inline] -fn full_rounds_dual_w12(lane0: &mut [u64; 12], lane1: &mut [u64; 12], constants: &[[u64; 12]]) { - for rc in constants { - unsafe { - add_rc_dual_asm(lane0, lane1, rc); - sbox_layer_dual_asm(lane0, lane1); - } - *lane0 = unsafe { mds_neon_w12(lane0) }; - *lane1 = unsafe { mds_neon_w12(lane1) }; - } -} - -/// Run all partial rounds on two width-8 lanes simultaneously. -/// -/// Uses dual-lane S-box and sparse matmul primitives to keep the -/// pipeline full. The scalar constant is added to each lane separately -/// (no dual variant needed for a single-element addition). 
-#[inline] -fn partial_rounds_dual_w8( - lane0: &mut [u64; 8], - lane1: &mut [u64; 8], - first_rc: &[u64; 8], - m_i: &[[u64; 8]; 8], - sparse_first_row: &[[u64; 8]], - v: &[[u64; 8]], - round_constants: &[u64], -) { - // Add the first-round constant to both lanes. - unsafe { - add_rc_dual_asm(lane0, lane1, first_rc); - } - - // Dense transition matrix on both lanes. - dense_matmul_dual_asm_w8(lane0, lane1, m_i); - - // Main partial-round loop. - let rounds_p = sparse_first_row.len(); - for r in 0..rounds_p - 1 { - unsafe { - sbox_s0_dual_asm(lane0, lane1); - add_scalar_s0_asm(lane0, round_constants[r]); - add_scalar_s0_asm(lane1, round_constants[r]); - cheap_matmul_dual_asm_w8(lane0, lane1, &sparse_first_row[r], &v[r]); - } - } - - // Last partial round: no scalar constant. - unsafe { - sbox_s0_dual_asm(lane0, lane1); - cheap_matmul_dual_asm_w8( - lane0, - lane1, - &sparse_first_row[rounds_p - 1], - &v[rounds_p - 1], - ); - } -} - -/// Run all partial rounds on two width-12 lanes simultaneously. -/// -/// Same structure as the width-8 dual variant. 
-#[inline] -fn partial_rounds_dual_w12( - lane0: &mut [u64; 12], - lane1: &mut [u64; 12], - first_rc: &[u64; 12], - m_i: &[[u64; 12]; 12], - sparse_first_row: &[[u64; 12]], - v: &[[u64; 12]], - round_constants: &[u64], -) { - unsafe { - add_rc_dual_asm(lane0, lane1, first_rc); - } - dense_matmul_dual_asm_w12(lane0, lane1, m_i); - - let rounds_p = sparse_first_row.len(); - for r in 0..rounds_p - 1 { - unsafe { - sbox_s0_dual_asm(lane0, lane1); - add_scalar_s0_asm(lane0, round_constants[r]); - add_scalar_s0_asm(lane1, round_constants[r]); - cheap_matmul_dual_asm_w12(lane0, lane1, &sparse_first_row[r], &v[r]); - } - } - unsafe { - sbox_s0_dual_asm(lane0, lane1); - cheap_matmul_dual_asm_w12( - lane0, - lane1, - &sparse_first_row[rounds_p - 1], - &v[rounds_p - 1], - ); - } -} - -impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> { - fn permute_mut(&self, state: &mut [Goldilocks; 8]) { - // Zero-cost transmute: Goldilocks is repr(transparent) over u64. - let raw = unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; - - // Initial full rounds, then partial rounds, then terminal full rounds. - full_rounds_scalar_w8(raw, &self.initial_constants_raw); - partial_rounds_scalar_w8( - raw, - &self.first_round_constants_raw, - &self.m_i_raw, - &self.sparse_first_row_raw, - &self.v_raw, - &self.round_constants_raw, - ); - full_rounds_scalar_w8(raw, &self.terminal_constants_raw); - } -} - -impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksFused<8> {} - -impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) { - // Unpack the two lanes from the packed representation. - let (mut lane0, mut lane1) = unpack_lanes(state); - - // Run the full permutation on both lanes simultaneously. 
- full_rounds_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw); - partial_rounds_dual_w8( - &mut lane0, - &mut lane1, - &self.first_round_constants_raw, - &self.m_i_raw, - &self.sparse_first_row_raw, - &self.v_raw, - &self.round_constants_raw, - ); - full_rounds_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw); - - // Repack both lanes into the packed representation. - pack_lanes(state, &lane0, &lane1); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksFused<8> {} - -impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> { - fn permute_mut(&self, state: &mut [Goldilocks; 12]) { - let raw = unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; - - full_rounds_scalar_w12(raw, &self.initial_constants_raw); - partial_rounds_scalar_w12( - raw, - &self.first_round_constants_raw, - &self.m_i_raw, - &self.sparse_first_row_raw, - &self.v_raw, - &self.round_constants_raw, - ); - full_rounds_scalar_w12(raw, &self.terminal_constants_raw); - } -} - -impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksFused<12> {} - -impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - - full_rounds_dual_w12(&mut lane0, &mut lane1, &self.initial_constants_raw); - partial_rounds_dual_w12( - &mut lane0, - &mut lane1, - &self.first_round_constants_raw, - &self.m_i_raw, - &self.sparse_first_row_raw, - &self.v_raw, - &self.round_constants_raw, - ); - full_rounds_dual_w12(&mut lane0, &mut lane1, &self.terminal_constants_raw); - - pack_lanes(state, &lane0, &lane1); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksFused<12> {} - -/// Dual-dispatch wrapper for Goldilocks Poseidon1. 
-/// -/// **Scalar** permutations use the NEON-accelerated MDS for full rounds -/// and LLVM-optimized sparse matrix decomposition for partial rounds. -/// This avoids sequential inline ASM that would prevent LLVM's -/// instruction scheduling optimizations on wide out-of-order cores. -/// -/// **Packed** permutations delegate to the fused dual-lane ASM path -/// with NEON MDS for full rounds and sparse matrix for partial rounds -/// (dual-lane interleaving hides multiply latency). -#[derive(Clone, Debug)] -pub struct Poseidon1GoldilocksDispatch { - /// Fused dual-lane path — used for packed permutations. - fused: Poseidon1GoldilocksFused, - /// Pre-computed full round constants for NEON MDS. - full_constants: FullRoundConstants, - /// Pre-computed partial round constants (textbook path for scalar, sparse for packed). - partial_constants: PartialRoundConstants, -} - -impl Poseidon1GoldilocksDispatch { - /// Create from fused and pre-computed constants. - pub const fn new( - fused: Poseidon1GoldilocksFused, - full_constants: FullRoundConstants, - partial_constants: PartialRoundConstants, - ) -> Self { - Self { - fused, - full_constants, - partial_constants, - } - } -} - -// --- Width 8 --- - -impl Permutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> { - fn permute_mut(&self, state: &mut [Goldilocks; 8]) { - let mds = MdsNeonGoldilocks; - full_round_initial_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds); - partial_permute_state::<_, _, 8, 7>(state, &self.partial_constants); - full_round_terminal_permute_state::<_, _, _, 8, 7>(state, &self.full_constants, &mds); - } -} - -impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon1GoldilocksDispatch<8> {} - -impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) { - self.fused.permute_mut(state); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon1GoldilocksDispatch<8> {} - 
-// --- Width 12 --- - -impl Permutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> { - fn permute_mut(&self, state: &mut [Goldilocks; 12]) { - let mds = MdsNeonGoldilocks; - full_round_initial_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds); - partial_permute_state::<_, _, 12, 7>(state, &self.partial_constants); - full_round_terminal_permute_state::<_, _, _, 12, 7>(state, &self.full_constants, &mds); - } -} - -impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon1GoldilocksDispatch<12> {} - -impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) { - // Extract both lanes, run the optimized scalar path on each, repack. - // Directly inline the scalar logic (NEON MDS full rounds + sparse partial - // rounds) to avoid trait-dispatch overhead and enable cross-call inlining. - let mut lane0: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[0]); - let mut lane1: [Goldilocks; 12] = core::array::from_fn(|i| state[i].0[1]); - - let mds = MdsNeonGoldilocks; - full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds); - partial_permute_state::<_, _, 12, 7>(&mut lane0, &self.partial_constants); - full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane0, &self.full_constants, &mds); - - full_round_initial_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds); - partial_permute_state::<_, _, 12, 7>(&mut lane1, &self.partial_constants); - full_round_terminal_permute_state::<_, _, _, 12, 7>(&mut lane1, &self.full_constants, &mds); - - for i in 0..12 { - state[i] = PackedGoldilocksNeon([lane0[i], lane1[i]]); - } - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon1GoldilocksDispatch<12> {} - -#[cfg(test)] -mod tests { - use p3_field::{PrimeCharacteristicRing, PrimeField64}; - use p3_poseidon1::Poseidon1Constants; - use p3_symmetric::Permutation; - use 
rand::rngs::SmallRng; - use rand::{RngExt, SeedableRng}; - - use super::*; - use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL}; - use crate::poseidon1::{ - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, GOLDILOCKS_POSEIDON1_RC_8, - GOLDILOCKS_POSEIDON1_RC_12, default_goldilocks_poseidon1_8, - default_goldilocks_poseidon1_12, - }; - - type F = Goldilocks; - - /// Build a width-8 fused permutation from the fixed round constants. - fn make_fused_w8() -> Poseidon1GoldilocksFused<8> { - let raw = Poseidon1Constants { - rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - mds_circ_col: MATRIX_CIRC_MDS_8_COL, - round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(), - }; - let (full, partial) = raw.to_optimized(); - Poseidon1GoldilocksFused::new(&full, &partial) - } - - /// Build a width-12 fused permutation from the fixed round constants. - fn make_fused_w12() -> Poseidon1GoldilocksFused<12> { - let raw = Poseidon1Constants { - rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, - mds_circ_col: MATRIX_CIRC_MDS_12_COL, - round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(), - }; - let (full, partial) = raw.to_optimized(); - Poseidon1GoldilocksFused::new(&full, &partial) - } - - /// Verify that the fused width-8 implementation matches the generic one - /// on both zero and random inputs. - #[test] - fn test_fused_matches_generic_w8() { - let generic = default_goldilocks_poseidon1_8(); - let fused = make_fused_w8(); - let mut rng = SmallRng::seed_from_u64(42); - - // Zero input. 
- let mut g_state = [F::ZERO; 8]; - let mut f_state = [F::ZERO; 8]; - generic.permute_mut(&mut g_state); - fused.permute_mut(&mut f_state); - for i in 0..8 { - assert_eq!( - f_state[i].as_canonical_u64(), - g_state[i].as_canonical_u64(), - "Fused vs generic mismatch at index {i} (zero input, w8)" - ); - } - - // Random input. - let mut g_state: [F; 8] = rng.random(); - let mut f_state = g_state; - generic.permute_mut(&mut g_state); - fused.permute_mut(&mut f_state); - for i in 0..8 { - assert_eq!( - f_state[i].as_canonical_u64(), - g_state[i].as_canonical_u64(), - "Fused vs generic mismatch at index {i} (random input, w8)" - ); - } - } - - /// Same fused-vs-generic verification for width 12. - #[test] - fn test_fused_matches_generic_w12() { - let generic = default_goldilocks_poseidon1_12(); - let fused = make_fused_w12(); - let mut rng = SmallRng::seed_from_u64(42); - - let mut g_state = [F::ZERO; 12]; - let mut f_state = [F::ZERO; 12]; - generic.permute_mut(&mut g_state); - fused.permute_mut(&mut f_state); - for i in 0..12 { - assert_eq!( - f_state[i].as_canonical_u64(), - g_state[i].as_canonical_u64(), - "Fused vs generic mismatch at index {i} (zero input, w12)" - ); - } - - let mut g_state: [F; 12] = rng.random(); - let mut f_state = g_state; - generic.permute_mut(&mut g_state); - fused.permute_mut(&mut f_state); - for i in 0..12 { - assert_eq!( - f_state[i].as_canonical_u64(), - g_state[i].as_canonical_u64(), - "Fused vs generic mismatch at index {i} (random input, w12)" - ); - } - } - - /// Verify that the packed (dual-lane) width-8 path matches running - /// two independent scalar permutations. - #[test] - fn test_packed_matches_scalar_w8() { - let fused = make_fused_w8(); - let mut rng = SmallRng::seed_from_u64(123); - - // Two independent random scalar inputs. - let scalar_a: [F; 8] = rng.random(); - let scalar_b: [F; 8] = rng.random(); - - // Pack them into a single packed state and permute. 
- let mut packed: [PackedGoldilocksNeon; 8] = - core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]])); - fused.permute_mut(&mut packed); - - // Compute the expected result by running scalar on each independently. - let mut expected_a = scalar_a; - let mut expected_b = scalar_b; - fused.permute_mut(&mut expected_a); - fused.permute_mut(&mut expected_b); - - // Lane 0 must match the first scalar, lane 1 must match the second. - for i in 0..8 { - assert_eq!( - packed[i].0[0].as_canonical_u64(), - expected_a[i].as_canonical_u64(), - "Packed lane0 mismatch at index {i} (w8)" - ); - assert_eq!( - packed[i].0[1].as_canonical_u64(), - expected_b[i].as_canonical_u64(), - "Packed lane1 mismatch at index {i} (w8)" - ); - } - } - - /// Same packed-vs-scalar verification for width 12. - #[test] - fn test_packed_matches_scalar_w12() { - let fused = make_fused_w12(); - let mut rng = SmallRng::seed_from_u64(123); - - let scalar_a: [F; 12] = rng.random(); - let scalar_b: [F; 12] = rng.random(); - - let mut packed: [PackedGoldilocksNeon; 12] = - core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]])); - fused.permute_mut(&mut packed); - - let mut expected_a = scalar_a; - let mut expected_b = scalar_b; - fused.permute_mut(&mut expected_a); - fused.permute_mut(&mut expected_b); - - for i in 0..12 { - assert_eq!( - packed[i].0[0].as_canonical_u64(), - expected_a[i].as_canonical_u64(), - "Packed lane0 mismatch at index {i} (w12)" - ); - assert_eq!( - packed[i].0[1].as_canonical_u64(), - expected_b[i].as_canonical_u64(), - "Packed lane1 mismatch at index {i} (w12)" - ); - } - } - - /// Known-answer test for width 8 (sequential 0..7 input). 
- #[test] - fn test_fused_kat_w8() { - let fused = make_fused_w8(); - let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]); - fused.permute_mut(&mut input); - - let expected: [F; 8] = F::new_array([ - 2431226948502761687, - 9427563026145807618, - 6827549936272051660, - 16907684411084503785, - 10131745626715172913, - 17448305483431576765, - 9066501914269485014, - 12095238468458521303, - ]); - assert_eq!(input, expected); - } - - /// Known-answer test for width 12 (sequential 0..11 input). - #[test] - fn test_fused_kat_w12() { - let fused = make_fused_w12(); - let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); - fused.permute_mut(&mut input); - - let expected: [F; 12] = F::new_array([ - 15595088881848875364, - 9564850329150784619, - 13607005230761744521, - 12117102595842533385, - 2814257411756993122, - 11640647689983397089, - 14363867760831937423, - 13323891071259596526, - 11219803511311150468, - 9221595262780869902, - 5898229059046891887, - 18181291031484020550, - ]); - assert_eq!(input, expected); - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs deleted file mode 100644 index 3ca1382a9..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon1_asm.rs +++ /dev/null @@ -1,843 +0,0 @@ -//! ARM assembly primitives for the Poseidon1 permutation over Goldilocks. - -use super::utils::{add_asm, mul_add_asm, mul_asm}; - -// --------------------------------------------------------------------------- -// S-box: x -> x^7 (applied to the first element only) -// --------------------------------------------------------------------------- - -/// Apply the degree-7 S-box to the first element of the state. 
-/// -/// Computes `x^7` using four multiplications via the addition chain: -/// -/// ```text -/// x -> x^2 -> x^3 (= x^2 * x) -/// x^4 (= x^2 * x^2) -/// x^7 (= x^3 * x^4) -/// ``` -/// -/// Only the first element is modified. All other elements are unchanged. -/// This corresponds to the non-linear step of a **partial round**. -#[inline(always)] -pub unsafe fn sbox_s0_asm(state: &mut [u64]) { - unsafe { - // Load the first element. - let s0 = state[0]; - - // Square: x^2. - let s0_2 = mul_asm(s0, s0); - - // Cube: x^3 = x^2 * x. - let s0_3 = mul_asm(s0_2, s0); - - // Fourth power: x^4 = x^2 * x^2. - let s0_4 = mul_asm(s0_2, s0_2); - - // Seventh power: x^7 = x^3 * x^4. - state[0] = mul_asm(s0_3, s0_4); - } -} - -/// Dual-lane S-box on the first element of two independent states. -/// -/// Applies the same degree-7 S-box to both first elements. Interleaving -/// the two chains hides the multiplication latency: while one multiply -/// retires, the other is already in flight. -#[inline(always)] -pub unsafe fn sbox_s0_dual_asm(state0: &mut [u64], state1: &mut [u64]) { - unsafe { - // Load both first elements. - let a = state0[0]; - let b = state1[0]; - - // Square both. - let a2 = mul_asm(a, a); - let b2 = mul_asm(b, b); - - // Cube both: x^3 = x^2 * x. - let a3 = mul_asm(a2, a); - let b3 = mul_asm(b2, b); - - // Fourth power both: x^4 = x^2 * x^2. - let a4 = mul_asm(a2, a2); - let b4 = mul_asm(b2, b2); - - // Seventh power both: x^7 = x^3 * x^4. - state0[0] = mul_asm(a3, a4); - state1[0] = mul_asm(b3, b4); - } -} - -// --------------------------------------------------------------------------- -// Sparse matrix-vector multiply (partial-round linear layer) -// --------------------------------------------------------------------------- - -/// Sparse matrix-vector multiply for a width-8 state. -/// -/// Implements the partial-round linear layer. 
The sparse matrix is -/// encoded as its first row and a sub-diagonal vector: -/// -/// ```text -/// new[0] = dot(first_row, state) (dot product) -/// new[i] = state[i] + state[0] * v[i-1] (for i >= 1) -/// ``` -/// -/// The original first element is captured before the dot product -/// overwrites it. The unrolled form avoids loop overhead and gives -/// the scheduler maximum freedom to reorder independent multiply-adds. -#[inline(always)] -pub unsafe fn cheap_matmul_asm_w8(state: &mut [u64; 8], first_row: &[u64; 8], v: &[u64; 8]) { - unsafe { - // Capture the original first element before it gets overwritten. - let old_s0 = state[0]; - - // Dot product: accumulate dot(first_row, state). - let mut acc = mul_asm(state[0], first_row[0]); - acc = mul_add_asm(state[1], first_row[1], acc); - acc = mul_add_asm(state[2], first_row[2], acc); - acc = mul_add_asm(state[3], first_row[3], acc); - acc = mul_add_asm(state[4], first_row[4], acc); - acc = mul_add_asm(state[5], first_row[5], acc); - acc = mul_add_asm(state[6], first_row[6], acc); - acc = mul_add_asm(state[7], first_row[7], acc); - - // Tail update: each remaining element gets old_first * v[i-1] added. - state[1] = mul_add_asm(old_s0, v[0], state[1]); - state[2] = mul_add_asm(old_s0, v[1], state[2]); - state[3] = mul_add_asm(old_s0, v[2], state[3]); - state[4] = mul_add_asm(old_s0, v[3], state[4]); - state[5] = mul_add_asm(old_s0, v[4], state[5]); - state[6] = mul_add_asm(old_s0, v[5], state[6]); - state[7] = mul_add_asm(old_s0, v[6], state[7]); - - // Write the dot-product result into the first slot. - state[0] = acc; - } -} - -/// Sparse matrix-vector multiply for a width-12 state. -/// -/// Same decomposition as the width-8 variant: -/// - Dot product for the new first element. -/// - Scalar multiply-add for every other element. 
-#[inline(always)] -pub unsafe fn cheap_matmul_asm_w12(state: &mut [u64; 12], first_row: &[u64; 12], v: &[u64; 12]) { - unsafe { - // Capture the original first element before it gets overwritten. - let old_s0 = state[0]; - - // Dot product: accumulate dot(first_row, state). - let mut acc = mul_asm(state[0], first_row[0]); - acc = mul_add_asm(state[1], first_row[1], acc); - acc = mul_add_asm(state[2], first_row[2], acc); - acc = mul_add_asm(state[3], first_row[3], acc); - acc = mul_add_asm(state[4], first_row[4], acc); - acc = mul_add_asm(state[5], first_row[5], acc); - acc = mul_add_asm(state[6], first_row[6], acc); - acc = mul_add_asm(state[7], first_row[7], acc); - acc = mul_add_asm(state[8], first_row[8], acc); - acc = mul_add_asm(state[9], first_row[9], acc); - acc = mul_add_asm(state[10], first_row[10], acc); - acc = mul_add_asm(state[11], first_row[11], acc); - - // Tail update: each remaining element gets old_first * v[i-1] added. - state[1] = mul_add_asm(old_s0, v[0], state[1]); - state[2] = mul_add_asm(old_s0, v[1], state[2]); - state[3] = mul_add_asm(old_s0, v[2], state[3]); - state[4] = mul_add_asm(old_s0, v[3], state[4]); - state[5] = mul_add_asm(old_s0, v[4], state[5]); - state[6] = mul_add_asm(old_s0, v[5], state[6]); - state[7] = mul_add_asm(old_s0, v[6], state[7]); - state[8] = mul_add_asm(old_s0, v[7], state[8]); - state[9] = mul_add_asm(old_s0, v[8], state[9]); - state[10] = mul_add_asm(old_s0, v[9], state[10]); - state[11] = mul_add_asm(old_s0, v[10], state[11]); - - // Write the dot-product result into the first slot. - state[0] = acc; - } -} - -/// Dual-lane sparse matrix-vector multiply for a width-8 state. -/// -/// Processes two independent states through the same sparse matrix -/// simultaneously. Both lanes share the same first-row and sub-diagonal -/// vectors, since the matrix is fixed for a given partial round. -/// -/// Interleaving multiply-adds from both lanes keeps the pipeline full. 
-#[inline(always)] -pub unsafe fn cheap_matmul_dual_asm_w8( - s0: &mut [u64; 8], - s1: &mut [u64; 8], - first_row: &[u64; 8], - v: &[u64; 8], -) { - unsafe { - // Capture the original first elements from both lanes. - let old_a = s0[0]; - let old_b = s1[0]; - - // Dot products: one per lane, interleaved. - let mut acc_a = mul_asm(s0[0], first_row[0]); - let mut acc_b = mul_asm(s1[0], first_row[0]); - acc_a = mul_add_asm(s0[1], first_row[1], acc_a); - acc_b = mul_add_asm(s1[1], first_row[1], acc_b); - acc_a = mul_add_asm(s0[2], first_row[2], acc_a); - acc_b = mul_add_asm(s1[2], first_row[2], acc_b); - acc_a = mul_add_asm(s0[3], first_row[3], acc_a); - acc_b = mul_add_asm(s1[3], first_row[3], acc_b); - acc_a = mul_add_asm(s0[4], first_row[4], acc_a); - acc_b = mul_add_asm(s1[4], first_row[4], acc_b); - acc_a = mul_add_asm(s0[5], first_row[5], acc_a); - acc_b = mul_add_asm(s1[5], first_row[5], acc_b); - acc_a = mul_add_asm(s0[6], first_row[6], acc_a); - acc_b = mul_add_asm(s1[6], first_row[6], acc_b); - acc_a = mul_add_asm(s0[7], first_row[7], acc_a); - acc_b = mul_add_asm(s1[7], first_row[7], acc_b); - - // Tail updates: both lanes, interleaved. - s0[1] = mul_add_asm(old_a, v[0], s0[1]); - s1[1] = mul_add_asm(old_b, v[0], s1[1]); - s0[2] = mul_add_asm(old_a, v[1], s0[2]); - s1[2] = mul_add_asm(old_b, v[1], s1[2]); - s0[3] = mul_add_asm(old_a, v[2], s0[3]); - s1[3] = mul_add_asm(old_b, v[2], s1[3]); - s0[4] = mul_add_asm(old_a, v[3], s0[4]); - s1[4] = mul_add_asm(old_b, v[3], s1[4]); - s0[5] = mul_add_asm(old_a, v[4], s0[5]); - s1[5] = mul_add_asm(old_b, v[4], s1[5]); - s0[6] = mul_add_asm(old_a, v[5], s0[6]); - s1[6] = mul_add_asm(old_b, v[5], s1[6]); - s0[7] = mul_add_asm(old_a, v[6], s0[7]); - s1[7] = mul_add_asm(old_b, v[6], s1[7]); - - // Write the dot-product results into the first slots. - s0[0] = acc_a; - s1[0] = acc_b; - } -} - -/// Dual-lane sparse matrix-vector multiply for a width-12 state. 
-/// -/// Same as the width-8 dual variant but with 12-element states. -/// Uses loops instead of full unrolling since width 12 is large -/// enough that code size matters more than marginal scheduling gains. -#[inline(always)] -pub unsafe fn cheap_matmul_dual_asm_w12( - s0: &mut [u64; 12], - s1: &mut [u64; 12], - first_row: &[u64; 12], - v: &[u64; 12], -) { - unsafe { - // Capture the original first elements from both lanes. - let old_a = s0[0]; - let old_b = s1[0]; - - // Dot products: one per lane, interleaved. - let mut acc_a = mul_asm(s0[0], first_row[0]); - let mut acc_b = mul_asm(s1[0], first_row[0]); - for i in 1..12 { - acc_a = mul_add_asm(s0[i], first_row[i], acc_a); - acc_b = mul_add_asm(s1[i], first_row[i], acc_b); - } - - // Tail updates: both lanes. - for i in 1..12 { - s0[i] = mul_add_asm(old_a, v[i - 1], s0[i]); - s1[i] = mul_add_asm(old_b, v[i - 1], s1[i]); - } - - // Write the dot-product results into the first slots. - s0[0] = acc_a; - s1[0] = acc_b; - } -} - -// --------------------------------------------------------------------------- -// Dense matrix-vector multiply (full-round linear layer) -// --------------------------------------------------------------------------- - -/// Dense matrix-vector multiply for a width-8 state. -/// -/// Computes `state = M * state` where M is a full 8x8 MDS matrix -/// stored in row-major order. Used in the **full rounds** of the -/// permutation where every element is mixed with every other. -/// -/// Each output element is the dot product of one matrix row with the -/// input vector. The input is snapshotted before any writes occur. -pub fn dense_matmul_asm_w8(state: &mut [u64; 8], m: &[[u64; 8]; 8]) { - unsafe { - // Snapshot the current state so reads are not clobbered by writes. - let input = *state; - - // Compute each output element as a dot product of one matrix - // row with the snapshotted input. 
- for i in 0..8 { - let mut acc = mul_asm(input[0], m[i][0]); - for j in 1..8 { - acc = mul_add_asm(input[j], m[i][j], acc); - } - state[i] = acc; - } - } -} - -/// Dense matrix-vector multiply for a width-12 state. -/// -/// Same as the width-8 variant but with a 12×12 MDS matrix. -pub fn dense_matmul_asm_w12(state: &mut [u64; 12], m: &[[u64; 12]; 12]) { - unsafe { - // Snapshot the current state. - let input = *state; - - // One dot product per output element. - for i in 0..12 { - let mut acc = mul_asm(input[0], m[i][0]); - for j in 1..12 { - acc = mul_add_asm(input[j], m[i][j], acc); - } - state[i] = acc; - } - } -} - -/// Dual-lane dense matrix-vector multiply for a width-8 state. -/// -/// Multiplies two independent state vectors by the same 8×8 matrix. -/// Both lanes share the matrix but have their own input and output. -/// -/// Interleaving the two dot-product chains per row hides latency. -pub fn dense_matmul_dual_asm_w8(s0: &mut [u64; 8], s1: &mut [u64; 8], m: &[[u64; 8]; 8]) { - unsafe { - // Snapshot both input vectors. - let in0 = *s0; - let in1 = *s1; - - // For each row, compute both dot products in lockstep. - for i in 0..8 { - let mut a = mul_asm(in0[0], m[i][0]); - let mut b = mul_asm(in1[0], m[i][0]); - for j in 1..8 { - a = mul_add_asm(in0[j], m[i][j], a); - b = mul_add_asm(in1[j], m[i][j], b); - } - s0[i] = a; - s1[i] = b; - } - } -} - -/// Dual-lane dense matrix-vector multiply for a width-12 state. -/// -/// Same as the width-8 dual variant but with a 12×12 matrix. -pub fn dense_matmul_dual_asm_w12(s0: &mut [u64; 12], s1: &mut [u64; 12], m: &[[u64; 12]; 12]) { - unsafe { - // Snapshot both input vectors. - let in0 = *s0; - let in1 = *s1; - - // For each row, compute both dot products in lockstep. 
- for i in 0..12 { - let mut a = mul_asm(in0[0], m[i][0]); - let mut b = mul_asm(in1[0], m[i][0]); - for j in 1..12 { - a = mul_add_asm(in0[j], m[i][j], a); - b = mul_add_asm(in1[j], m[i][j], b); - } - s0[i] = a; - s1[i] = b; - } - } -} - -// --------------------------------------------------------------------------- -// Round-constant addition -// --------------------------------------------------------------------------- - -/// Add round constants to every element of the state. -/// -/// This is the first step of every Poseidon1 round. Each element -/// receives its own constant, added in the Goldilocks field. -/// -/// Generic over the state width to work with both width-8 and width-12. -#[inline(always)] -pub unsafe fn add_rc_asm(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) { - unsafe { - // Element-wise modular addition. - for i in 0..WIDTH { - state[i] = add_asm(state[i], rc[i]); - } - } -} - -/// Dual-lane round-constant addition. -/// -/// Adds the same constants to two independent states. Both lanes -/// share the constants because they are at the same round position. -#[inline(always)] -pub unsafe fn add_rc_dual_asm( - s0: &mut [u64; WIDTH], - s1: &mut [u64; WIDTH], - rc: &[u64; WIDTH], -) { - unsafe { - // Both lanes receive the same constant at each position. - for i in 0..WIDTH { - s0[i] = add_asm(s0[i], rc[i]); - s1[i] = add_asm(s1[i], rc[i]); - } - } -} - -/// Add a single round constant to the first element only. -/// -/// Used in partial rounds where only the first element enters the -/// S-box and thus only needs its own constant added. -#[inline(always)] -pub unsafe fn add_scalar_s0_asm(state: &mut [u64], rc: u64) { - unsafe { - // Only the first element is modified. 
- state[0] = add_asm(state[0], rc); - } -} - -#[cfg(test)] -mod tests { - use p3_field::PrimeField64; - use proptest::prelude::*; - use rand::SeedableRng; - use rand::rngs::SmallRng; - - use super::*; - use crate::Goldilocks; - - type F = Goldilocks; - - /// Reduce a raw `u64` to its canonical Goldilocks representative. - /// - /// Wraps the value into a field element and extracts the unique - /// representative in `[0, P)`. This is the single source of truth - /// for comparing ASM outputs (which may carry unreduced values) - /// against field-level references. - fn canon(x: u64) -> u64 { - F::new(x).as_canonical_u64() - } - - proptest! { - // ================================================================ - // S-box: first element raised to the 7th power - // ================================================================ - - /// Verify the single-lane S-box against a field-level reference. - /// - /// The reference computes x^7 step by step using field multiplication. - /// Only the first element should change; the rest must be untouched. - #[test] - fn test_sbox_s0_asm(vals in prop::array::uniform8(any::())) { - // Build the expected x^7 using the field multiplication chain. - let x = F::new(vals[0]); - let x2 = x * x; - let x3 = x2 * x; - let x4 = x2 * x2; - let expected_s0 = (x3 * x4).as_canonical_u64(); - - // Run the ASM version on a copy. - let mut state = vals; - unsafe { sbox_s0_asm(&mut state); } - - // The first element must match x^7. - prop_assert_eq!(canon(state[0]), expected_s0); - - // Every other element must be unchanged. - for i in 1..8 { - prop_assert_eq!(state[i], vals[i]); - } - } - - /// Verify the dual-lane S-box matches two independent single-lane calls. - /// - /// Runs the single-lane version on each lane separately as the - /// reference, then checks the dual-lane version produces the same. 
- #[test] - fn test_sbox_s0_dual_asm( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - // Build the reference by running single-lane on each lane. - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - sbox_s0_asm(&mut ref0); - sbox_s0_asm(&mut ref1); - } - - // Run the dual-lane version. - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { sbox_s0_dual_asm(&mut s0, &mut s1); } - - // Both first elements must match their reference. - prop_assert_eq!(canon(s0[0]), canon(ref0[0])); - prop_assert_eq!(canon(s1[0]), canon(ref1[0])); - - // All other elements must be unchanged. - for i in 1..8 { - prop_assert_eq!(s0[i], vals0[i]); - prop_assert_eq!(s1[i], vals1[i]); - } - } - - // ================================================================ - // Round-constant addition: element-wise field addition - // ================================================================ - - /// Verify round-constant addition (width 8) against field addition. - /// - /// Each element should equal the field sum of the original value - /// and its corresponding round constant. - #[test] - fn test_add_rc_asm_w8( - vals in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - // Build the expected result using field addition. - let expected: [u64; 8] = core::array::from_fn(|i| { - (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64() - }); - - // Run the ASM version. - let mut state = vals; - unsafe { add_rc_asm(&mut state, &rc); } - - // Every element must match. - for i in 0..8 { - prop_assert_eq!(canon(state[i]), expected[i]); - } - } - - /// Same verification for width 12. 
- #[test] - fn test_add_rc_asm_w12( - vals in prop::array::uniform12(any::()), - rc in prop::array::uniform12(any::()), - ) { - let expected: [u64; 12] = core::array::from_fn(|i| { - (F::new(vals[i]) + F::new(rc[i])).as_canonical_u64() - }); - - let mut state = vals; - unsafe { add_rc_asm(&mut state, &rc); } - - for i in 0..12 { - prop_assert_eq!(canon(state[i]), expected[i]); - } - } - - /// Verify dual-lane round-constant addition (width 8) matches - /// two independent single-lane calls. - #[test] - fn test_add_rc_dual_asm_w8( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - // Reference: single-lane on each independently. - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - add_rc_asm(&mut ref0, &rc); - add_rc_asm(&mut ref1, &rc); - } - - // Run the dual-lane version. - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); } - - // Both lanes must match their references. - for i in 0..8 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - /// Same dual-lane verification for width 12. 
- #[test] - fn test_add_rc_dual_asm_w12( - vals0 in prop::array::uniform12(any::()), - vals1 in prop::array::uniform12(any::()), - rc in prop::array::uniform12(any::()), - ) { - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - add_rc_asm(&mut ref0, &rc); - add_rc_asm(&mut ref1, &rc); - } - - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { add_rc_dual_asm(&mut s0, &mut s1, &rc); } - - for i in 0..12 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - // ================================================================ - // Scalar addition: first element only - // ================================================================ - - /// Verify that adding a scalar to the first element matches - /// field addition, and that all other elements are untouched. - #[test] - fn test_add_scalar_s0_asm(vals in prop::array::uniform8(any::()), rc: u64) { - // Expected: field sum of the first element and the constant. - let expected_s0 = (F::new(vals[0]) + F::new(rc)).as_canonical_u64(); - - // Run the ASM version. - let mut state = vals; - unsafe { add_scalar_s0_asm(&mut state, rc); } - - // The first element must match. - prop_assert_eq!(canon(state[0]), expected_s0); - - // Every other element must be unchanged. - for i in 1..8 { - prop_assert_eq!(state[i], vals[i]); - } - } - - // ================================================================ - // Sparse matrix-vector multiply (partial-round linear layer) - // - // The sparse matrix decomposes into: - // new[0] = dot(first_row, state) - // new[i] = state[i] + state[0] * v[i-1] for i >= 1 - // ================================================================ - - /// Verify the width-8 sparse matmul against a field-level reference. - /// - /// Builds the expected result by computing the dot product and - /// the per-element multiply-add using Goldilocks field operations. 
- #[test] - fn test_cheap_matmul_asm_w8( - vals in prop::array::uniform8(any::()), - first_row in prop::array::uniform8(any::()), - v in prop::array::uniform8(any::()), - ) { - // Lift raw values into field elements. - let f: [F; 8] = vals.map(F::new); - let fr: [F; 8] = first_row.map(F::new); - let fv: [F; 8] = v.map(F::new); - - // Capture the original first element. - let old_s0 = f[0]; - - // Dot product for the new first element. - let new_s0: F = (0..8).map(|i| f[i] * fr[i]).sum(); - - // Tail update for elements 1..8. - let mut expected = f; - for i in 1..8 { - expected[i] = f[i] + old_s0 * fv[i - 1]; - } - expected[0] = new_s0; - - // Run the ASM version. - let mut state = vals; - unsafe { cheap_matmul_asm_w8(&mut state, &first_row, &v); } - - // Every element must match. - for i in 0..8 { - prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); - } - } - - /// Same verification for width 12. - #[test] - fn test_cheap_matmul_asm_w12( - vals in prop::array::uniform12(any::()), - first_row in prop::array::uniform12(any::()), - v in prop::array::uniform12(any::()), - ) { - let f: [F; 12] = vals.map(F::new); - let fr: [F; 12] = first_row.map(F::new); - let fv: [F; 12] = v.map(F::new); - - let old_s0 = f[0]; - let new_s0: F = (0..12).map(|i| f[i] * fr[i]).sum(); - - let mut expected = f; - for i in 1..12 { - expected[i] = f[i] + old_s0 * fv[i - 1]; - } - expected[0] = new_s0; - - let mut state = vals; - unsafe { cheap_matmul_asm_w12(&mut state, &first_row, &v); } - - for i in 0..12 { - prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); - } - } - - /// Verify the width-8 dual-lane sparse matmul matches two - /// independent single-lane calls. - #[test] - fn test_cheap_matmul_dual_asm_w8( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - first_row in prop::array::uniform8(any::()), - v in prop::array::uniform8(any::()), - ) { - // Reference: single-lane on each independently. 
- let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - cheap_matmul_asm_w8(&mut ref0, &first_row, &v); - cheap_matmul_asm_w8(&mut ref1, &first_row, &v); - } - - // Run the dual-lane version. - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { cheap_matmul_dual_asm_w8(&mut s0, &mut s1, &first_row, &v); } - - // Both lanes must match their references. - for i in 0..8 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - /// Same dual-lane verification for width 12. - #[test] - fn test_cheap_matmul_dual_asm_w12( - vals0 in prop::array::uniform12(any::()), - vals1 in prop::array::uniform12(any::()), - first_row in prop::array::uniform12(any::()), - v in prop::array::uniform12(any::()), - ) { - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - cheap_matmul_asm_w12(&mut ref0, &first_row, &v); - cheap_matmul_asm_w12(&mut ref1, &first_row, &v); - } - - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { cheap_matmul_dual_asm_w12(&mut s0, &mut s1, &first_row, &v); } - - for i in 0..12 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - // ================================================================ - // Dense matrix-vector multiply (full-round linear layer) - // ================================================================ - - /// Verify the width-8 dense matmul against a field-level reference. - /// - /// Each output element is the dot product of one matrix row with - /// the input vector. The matrix is fixed from a deterministic seed. - #[test] - fn test_dense_matmul_asm_w8(vals in prop::array::uniform8(any::())) { - // Fixed matrix from a deterministic seed. - let mut rng = SmallRng::seed_from_u64(42); - let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng); - - // Reference: standard matrix-vector product using field ops. 
- let f: [F; 8] = vals.map(F::new); - let expected: [F; 8] = core::array::from_fn(|i| { - (0..8).map(|j| f[j] * F::new(m[i][j])).sum() - }); - - // Run the ASM version. - let mut state = vals; - dense_matmul_asm_w8(&mut state, &m); - - // Every element must match. - for i in 0..8 { - prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); - } - } - - /// Same verification for width 12. - #[test] - fn test_dense_matmul_asm_w12(vals in prop::array::uniform12(any::())) { - let mut rng = SmallRng::seed_from_u64(43); - let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng); - - let f: [F; 12] = vals.map(F::new); - let expected: [F; 12] = core::array::from_fn(|i| { - (0..12).map(|j| f[j] * F::new(m[i][j])).sum() - }); - - let mut state = vals; - dense_matmul_asm_w12(&mut state, &m); - - for i in 0..12 { - prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); - } - } - - /// Verify the width-8 dual-lane dense matmul matches two - /// independent single-lane calls. - #[test] - fn test_dense_matmul_dual_asm_w8( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - // Fixed matrix from a deterministic seed. - let mut rng = SmallRng::seed_from_u64(44); - let m: [[u64; 8]; 8] = rand::RngExt::random(&mut rng); - - // Reference: single-lane on each independently. - let mut ref0 = vals0; - let mut ref1 = vals1; - dense_matmul_asm_w8(&mut ref0, &m); - dense_matmul_asm_w8(&mut ref1, &m); - - // Run the dual-lane version. - let mut s0 = vals0; - let mut s1 = vals1; - dense_matmul_dual_asm_w8(&mut s0, &mut s1, &m); - - // Both lanes must match their references. - for i in 0..8 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - /// Same dual-lane verification for width 12. 
- #[test] - fn test_dense_matmul_dual_asm_w12( - vals0 in prop::array::uniform12(any::()), - vals1 in prop::array::uniform12(any::()), - ) { - let mut rng = SmallRng::seed_from_u64(45); - let m: [[u64; 12]; 12] = rand::RngExt::random(&mut rng); - - let mut ref0 = vals0; - let mut ref1 = vals1; - dense_matmul_asm_w12(&mut ref0, &m); - dense_matmul_asm_w12(&mut ref1, &m); - - let mut s0 = vals0; - let mut s1 = vals1; - dense_matmul_dual_asm_w12(&mut s0, &mut s1, &m); - - for i in 0..12 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs deleted file mode 100644 index cf74b4df8..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2.rs +++ /dev/null @@ -1,652 +0,0 @@ -//! Optimized Poseidon2 for Goldilocks on aarch64. -//! -//! Uses ARM inline assembly with latency hiding via interleaved S-box/MDS computation. -//! Fully unrolled internal rounds for W8, W12, W16. -//! -//! For packed operations, lanes are extracted to scalar, processed with interleaved -//! dual-lane ASM, then repacked. This is faster than using PackedGoldilocksNeon -//! arithmetic directly because the scalar `add_asm` avoids the modular reduction -//! overhead present in NEON addition. - -use alloc::vec::Vec; - -use p3_poseidon2::{ - ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, InternalLayer, - InternalLayerConstructor, poseidon2_round_numbers_128, -}; -use p3_symmetric::{CryptographicPermutation, Permutation}; -use rand::distr::{Distribution, StandardUniform}; -use rand::{Rng, RngExt}; - -use super::packing::PackedGoldilocksNeon; -use super::poseidon2_asm::*; -use super::utils::{pack_lanes, unpack_lanes}; -use crate::{Goldilocks, MATRIX_DIAG_20_GOLDILOCKS}; - -/// Degree of the chosen permutation polynomial for Goldilocks. 
-const GOLDILOCKS_S_BOX_DEGREE: u64 = 7; - -/// ASM-optimized internal layer with split-state s0-in-register, pre-converted constants. -#[derive(Debug, Default, Clone)] -pub struct Poseidon2InternalLayerGoldilocksAsm { - constants_raw: Vec, -} - -impl InternalLayerConstructor for Poseidon2InternalLayerGoldilocksAsm { - fn new_from_constants(internal_constants: Vec) -> Self { - let constants_raw = internal_constants.iter().map(|c| c.value).collect(); - Self { constants_raw } - } -} - -const DIAG_RAW_20: [u64; 20] = { - let mut arr = [0u64; 20]; - let mut i = 0; - while i < 20 { - arr[i] = MATRIX_DIAG_20_GOLDILOCKS[i].value; - i += 1; - } - arr -}; - -impl InternalLayer for Poseidon2InternalLayerGoldilocksAsm { - fn permute_state(&self, state: &mut [Goldilocks; 8]) { - let state_raw: &mut [u64; 8] = - unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; - internal_permute_state_asm_w8(state_raw, &self.constants_raw); - } -} - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [Goldilocks; 12]) { - let state_raw: &mut [u64; 12] = - unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; - internal_permute_state_asm_w12(state_raw, &self.constants_raw); - } -} - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [Goldilocks; 16]) { - let state_raw: &mut [u64; 16] = - unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; - internal_permute_state_asm_w16(state_raw, &self.constants_raw); - } -} - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [Goldilocks; 20]) { - let state_raw: &mut [u64; 20] = - unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; - internal_permute_state_asm(state_raw, &DIAG_RAW_20, &self.constants_raw); - } -} - -#[derive(Clone)] -pub struct Poseidon2ExternalLayerGoldilocksAsm { - initial_constants_raw: Vec<[u64; WIDTH]>, - 
terminal_constants_raw: Vec<[u64; WIDTH]>, -} - -impl ExternalLayerConstructor - for Poseidon2ExternalLayerGoldilocksAsm -{ - fn new_from_constants(external_constants: ExternalLayerConstants) -> Self { - let initial_constants_raw = external_constants - .get_initial_constants() - .iter() - .map(|rc| core::array::from_fn(|i| rc[i].value)) - .collect(); - let terminal_constants_raw = external_constants - .get_terminal_constants() - .iter() - .map(|rc| core::array::from_fn(|i| rc[i].value)) - .collect(); - Self { - initial_constants_raw, - terminal_constants_raw, - } - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<8> -{ - fn permute_state_initial(&self, state: &mut [Goldilocks; 8]) { - let state_raw: &mut [u64; 8] = - unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; - external_initial_permute_w8(state_raw, &self.initial_constants_raw); - } - - fn permute_state_terminal(&self, state: &mut [Goldilocks; 8]) { - let state_raw: &mut [u64; 8] = - unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; - external_terminal_permute_w8(state_raw, &self.terminal_constants_raw); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<12> -{ - fn permute_state_initial(&self, state: &mut [Goldilocks; 12]) { - let state_raw: &mut [u64; 12] = - unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; - external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); - } - - fn permute_state_terminal(&self, state: &mut [Goldilocks; 12]) { - let state_raw: &mut [u64; 12] = - unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; - external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<16> -{ - fn permute_state_initial(&self, state: &mut [Goldilocks; 16]) { - let state_raw: &mut [u64; 16] = - unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; - 
external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); - } - - fn permute_state_terminal(&self, state: &mut [Goldilocks; 16]) { - let state_raw: &mut [u64; 16] = - unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; - external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<20> -{ - fn permute_state_initial(&self, state: &mut [Goldilocks; 20]) { - let state_raw: &mut [u64; 20] = - unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; - external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); - } - - fn permute_state_terminal(&self, state: &mut [Goldilocks; 20]) { - let state_raw: &mut [u64; 20] = - unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; - external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); - } -} - -/// Type alias for scalar ASM-optimized Poseidon2. -pub type Poseidon2GoldilocksAsm = p3_poseidon2::Poseidon2< - Goldilocks, - Poseidon2ExternalLayerGoldilocksAsm, - Poseidon2InternalLayerGoldilocksAsm, - WIDTH, - GOLDILOCKS_S_BOX_DEGREE, ->; - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 8]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - internal_permute_split_dual_w8(&mut lane0, &mut lane1, &self.constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 12]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - internal_permute_split_dual_w12(&mut lane0, &mut lane1, &self.constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 16]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - 
internal_permute_split_dual_w16(&mut lane0, &mut lane1, &self.constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl InternalLayer - for Poseidon2InternalLayerGoldilocksAsm -{ - fn permute_state(&self, state: &mut [PackedGoldilocksNeon; 20]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - internal_permute_split_dual(&mut lane0, &mut lane1, &DIAG_RAW_20, &self.constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<8> -{ - fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 8]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_initial_permute_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw); - pack_lanes(state, &lane0, &lane1); - } - - fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 8]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_terminal_permute_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<12> -{ - fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 12]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw); - pack_lanes(state, &lane0, &lane1); - } - - fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 12]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<16> -{ - fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 16]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw); - pack_lanes(state, &lane0, &lane1); - } - - fn permute_state_terminal(&self, state: &mut 
[PackedGoldilocksNeon; 16]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl ExternalLayer - for Poseidon2ExternalLayerGoldilocksAsm<20> -{ - fn permute_state_initial(&self, state: &mut [PackedGoldilocksNeon; 20]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_initial_permute_dual(&mut lane0, &mut lane1, &self.initial_constants_raw); - pack_lanes(state, &lane0, &lane1); - } - - fn permute_state_terminal(&self, state: &mut [PackedGoldilocksNeon; 20]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - external_terminal_permute_dual(&mut lane0, &mut lane1, &self.terminal_constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -/// Fused Poseidon2 permutation for Goldilocks. -/// -/// Instead of unpacking/packing between each of the 3 phases (initial external, -/// internal, terminal external), this performs a single unpack at the start and -/// a single pack at the end, eliminating the redundant lane conversions per -/// packed permutation. 
-#[derive(Clone, Debug)] -pub struct Poseidon2GoldilocksFused { - internal_constants_raw: Vec, - initial_constants_raw: Vec<[u64; WIDTH]>, - terminal_constants_raw: Vec<[u64; WIDTH]>, -} - -impl Poseidon2GoldilocksFused { - pub fn new( - external_constants: &ExternalLayerConstants, - internal_constants: &[Goldilocks], - ) -> Self { - let internal_constants_raw = internal_constants.iter().map(|c| c.value).collect(); - let initial_constants_raw = external_constants - .get_initial_constants() - .iter() - .map(|rc| core::array::from_fn(|i| rc[i].value)) - .collect(); - let terminal_constants_raw = external_constants - .get_terminal_constants() - .iter() - .map(|rc| core::array::from_fn(|i| rc[i].value)) - .collect(); - Self { - internal_constants_raw, - initial_constants_raw, - terminal_constants_raw, - } - } - - pub fn new_from_rng(rounds_f: usize, rounds_p: usize, rng: &mut R) -> Self - where - StandardUniform: Distribution + Distribution<[Goldilocks; WIDTH]>, - { - let external_constants = ExternalLayerConstants::new_from_rng(rounds_f, rng); - let internal_constants = rng - .sample_iter(StandardUniform) - .take(rounds_p) - .collect::>(); - Self::new(&external_constants, &internal_constants) - } - - pub fn new_from_rng_128(rng: &mut R) -> Self - where - StandardUniform: Distribution + Distribution<[Goldilocks; WIDTH]>, - { - let round_numbers = - poseidon2_round_numbers_128::(WIDTH, GOLDILOCKS_S_BOX_DEGREE); - let (rounds_f, rounds_p) = round_numbers.unwrap_or_else(|e| panic!("{e}")); - Self::new_from_rng(rounds_f, rounds_p, rng) - } -} - -impl Permutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> { - fn permute_mut(&self, state: &mut [Goldilocks; 8]) { - let state_raw: &mut [u64; 8] = - unsafe { &mut *(state as *mut [Goldilocks; 8] as *mut [u64; 8]) }; - external_initial_permute_w8(state_raw, &self.initial_constants_raw); - internal_permute_state_asm_w8(state_raw, &self.internal_constants_raw); - external_terminal_permute_w8(state_raw, 
&self.terminal_constants_raw); - } -} - -impl CryptographicPermutation<[Goldilocks; 8]> for Poseidon2GoldilocksFused<8> {} - -impl Permutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> { - fn permute_mut(&self, state: &mut [Goldilocks; 12]) { - let state_raw: &mut [u64; 12] = - unsafe { &mut *(state as *mut [Goldilocks; 12] as *mut [u64; 12]) }; - external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); - internal_permute_state_asm_w12(state_raw, &self.internal_constants_raw); - external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); - } -} - -impl CryptographicPermutation<[Goldilocks; 12]> for Poseidon2GoldilocksFused<12> {} - -impl Permutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> { - fn permute_mut(&self, state: &mut [Goldilocks; 16]) { - let state_raw: &mut [u64; 16] = - unsafe { &mut *(state as *mut [Goldilocks; 16] as *mut [u64; 16]) }; - external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); - internal_permute_state_asm_w16(state_raw, &self.internal_constants_raw); - external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); - } -} - -impl CryptographicPermutation<[Goldilocks; 16]> for Poseidon2GoldilocksFused<16> {} - -impl Permutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> { - fn permute_mut(&self, state: &mut [Goldilocks; 20]) { - let state_raw: &mut [u64; 20] = - unsafe { &mut *(state as *mut [Goldilocks; 20] as *mut [u64; 20]) }; - external_initial_permute_state_asm(state_raw, &self.initial_constants_raw); - internal_permute_state_asm(state_raw, &DIAG_RAW_20, &self.internal_constants_raw); - external_terminal_permute_state_asm(state_raw, &self.terminal_constants_raw); - } -} - -impl CryptographicPermutation<[Goldilocks; 20]> for Poseidon2GoldilocksFused<20> {} - -impl Permutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 8]) { - let (mut lane0, mut lane1) = 
unpack_lanes(state); - external_initial_permute_dual_w8(&mut lane0, &mut lane1, &self.initial_constants_raw); - internal_permute_split_dual_w8(&mut lane0, &mut lane1, &self.internal_constants_raw); - external_terminal_permute_dual_w8(&mut lane0, &mut lane1, &self.terminal_constants_raw); - pack_lanes(state, &lane0, &lane1); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 8]> for Poseidon2GoldilocksFused<8> {} - -impl Permutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 12]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - let mut sv = lanes_to_neon(&lane0, &lane1); - external_initial_neon(&mut sv, &self.initial_constants_raw); - internal_permute_neon_w12(&mut sv, &self.internal_constants_raw); - external_terminal_neon(&mut sv, &self.terminal_constants_raw); - neon_to_lanes(&sv, &mut lane0, &mut lane1); - pack_lanes(state, &lane0, &lane1); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 12]> for Poseidon2GoldilocksFused<12> {} - -impl Permutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 16]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - let mut sv = lanes_to_neon(&lane0, &lane1); - external_initial_neon(&mut sv, &self.initial_constants_raw); - internal_permute_neon_w16(&mut sv, &self.internal_constants_raw); - external_terminal_neon(&mut sv, &self.terminal_constants_raw); - neon_to_lanes(&sv, &mut lane0, &mut lane1); - pack_lanes(state, &lane0, &lane1); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 16]> for Poseidon2GoldilocksFused<16> {} - -impl Permutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> { - fn permute_mut(&self, state: &mut [PackedGoldilocksNeon; 20]) { - let (mut lane0, mut lane1) = unpack_lanes(state); - let mut sv = lanes_to_neon(&lane0, &lane1); - external_initial_neon(&mut sv, &self.initial_constants_raw); - 
internal_permute_neon(&mut sv, &DIAG_RAW_20, &self.internal_constants_raw); - external_terminal_neon(&mut sv, &self.terminal_constants_raw); - neon_to_lanes(&sv, &mut lane0, &mut lane1); - pack_lanes(state, &lane0, &lane1); - } -} - -impl CryptographicPermutation<[PackedGoldilocksNeon; 20]> for Poseidon2GoldilocksFused<20> {} - -#[cfg(test)] -mod tests { - use p3_field::{PrimeCharacteristicRing, PrimeField64}; - use p3_poseidon2::{ExternalLayerConstants, InternalLayer, Poseidon2}; - use p3_symmetric::Permutation; - use rand::rngs::SmallRng; - use rand::{RngExt, SeedableRng}; - - use super::*; - use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE; - use crate::{ - GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8, - Poseidon2ExternalLayerGoldilocks, Poseidon2InternalLayerGoldilocks, - }; - - type F = Goldilocks; - - // Test that fully ASM-optimized implementation matches generic scalar - fn test_asm_matches_generic() - where - Poseidon2InternalLayerGoldilocks: InternalLayer, - Poseidon2InternalLayerGoldilocksAsm: InternalLayer, - Poseidon2ExternalLayerGoldilocksAsm: - ExternalLayer, - { - let mut rng = SmallRng::seed_from_u64(42); - - let external_constants = ExternalLayerConstants::::new_from_rng( - 2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, - &mut rng, - ); - let internal_constants: Vec = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8) - .map(|_| F::from_u64(rng.random())) - .collect(); - - // Generic scalar implementation - let generic_poseidon2: Poseidon2< - Goldilocks, - Poseidon2ExternalLayerGoldilocks, - Poseidon2InternalLayerGoldilocks, - WIDTH, - GOLDILOCKS_S_BOX_DEGREE, - > = Poseidon2::new(external_constants.clone(), internal_constants.clone()); - - // Fully ASM-optimized implementation - let asm_poseidon2: Poseidon2GoldilocksAsm = - Poseidon2::new(external_constants, internal_constants); - - // Test with zeros - let mut generic_input = [F::ZERO; WIDTH]; - let mut asm_input = [F::ZERO; WIDTH]; - - generic_poseidon2.permute_mut(&mut 
generic_input); - asm_poseidon2.permute_mut(&mut asm_input); - - for i in 0..WIDTH { - assert_eq!( - asm_input[i].as_canonical_u64(), - generic_input[i].as_canonical_u64(), - "ASM mismatch at index {i} for zero input" - ); - } - - // Test with random input - let mut generic_input: [F; WIDTH] = core::array::from_fn(|_| F::from_u64(rng.random())); - let mut asm_input = generic_input; - - generic_poseidon2.permute_mut(&mut generic_input); - asm_poseidon2.permute_mut(&mut asm_input); - - for i in 0..WIDTH { - assert_eq!( - asm_input[i].as_canonical_u64(), - generic_input[i].as_canonical_u64(), - "ASM mismatch at index {i} for random input" - ); - } - } - - fn test_fused_matches_generic() - where - Poseidon2InternalLayerGoldilocks: InternalLayer, - Poseidon2GoldilocksFused: - Permutation<[F; WIDTH]> + Permutation<[PackedGoldilocksNeon; WIDTH]>, - { - let mut rng = SmallRng::seed_from_u64(42); - - let external_constants = ExternalLayerConstants::::new_from_rng( - 2 * GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS, - &mut rng, - ); - let internal_constants: Vec = (0..GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8) - .map(|_| rng.random()) - .collect(); - - let generic_poseidon2: Poseidon2< - Goldilocks, - Poseidon2ExternalLayerGoldilocks, - Poseidon2InternalLayerGoldilocks, - WIDTH, - GOLDILOCKS_S_BOX_DEGREE, - > = Poseidon2::new(external_constants.clone(), internal_constants.clone()); - - let fused = - Poseidon2GoldilocksFused::::new(&external_constants, &internal_constants); - - // Scalar: fused vs generic - let mut generic_input = [F::ZERO; WIDTH]; - let mut fused_input = [F::ZERO; WIDTH]; - generic_poseidon2.permute_mut(&mut generic_input); - fused.permute_mut(&mut fused_input); - for i in 0..WIDTH { - assert_eq!( - fused_input[i].as_canonical_u64(), - generic_input[i].as_canonical_u64(), - "Fused scalar mismatch at index {i} for zero input" - ); - } - - let mut generic_input: [F; WIDTH] = rng.random(); - let mut fused_input = generic_input; - generic_poseidon2.permute_mut(&mut 
generic_input); - fused.permute_mut(&mut fused_input); - for i in 0..WIDTH { - assert_eq!( - fused_input[i].as_canonical_u64(), - generic_input[i].as_canonical_u64(), - "Fused scalar mismatch at index {i} for random input" - ); - } - - // Packed: fused packed vs scalar (each packed lane should match scalar) - let scalar_a: [F; WIDTH] = rng.random(); - let scalar_b: [F; WIDTH] = rng.random(); - - let mut packed_input: [PackedGoldilocksNeon; WIDTH] = - core::array::from_fn(|i| PackedGoldilocksNeon([scalar_a[i], scalar_b[i]])); - fused.permute_mut(&mut packed_input); - - let mut expected_a = scalar_a; - let mut expected_b = scalar_b; - fused.permute_mut(&mut expected_a); - fused.permute_mut(&mut expected_b); - - for i in 0..WIDTH { - assert_eq!( - packed_input[i].0[0].as_canonical_u64(), - expected_a[i].as_canonical_u64(), - "Fused packed lane0 mismatch at index {i}" - ); - assert_eq!( - packed_input[i].0[1].as_canonical_u64(), - expected_b[i].as_canonical_u64(), - "Fused packed lane1 mismatch at index {i}" - ); - } - } - - #[test] - fn test_asm_matches_generic_width_8() { - test_asm_matches_generic::<8>(); - } - - #[test] - fn test_asm_matches_generic_width_12() { - test_asm_matches_generic::<12>(); - } - - #[test] - fn test_asm_matches_generic_width_16() { - test_asm_matches_generic::<16>(); - } - - #[test] - fn test_asm_matches_generic_width_20() { - test_asm_matches_generic::<20>(); - } - - #[test] - fn test_fused_matches_generic_width_8() { - test_fused_matches_generic::<8>(); - } - - #[test] - fn test_fused_matches_generic_width_12() { - test_fused_matches_generic::<12>(); - } - - #[test] - fn test_fused_matches_generic_width_16() { - test_fused_matches_generic::<16>(); - } - - #[test] - fn test_fused_matches_generic_width_20() { - test_fused_matches_generic::<20>(); - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs deleted file mode 100644 index 
00b7fdc57..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/poseidon2_asm.rs +++ /dev/null @@ -1,2621 +0,0 @@ -//! ARM assembly primitives for Poseidon2 on Goldilocks. -//! -//! Latency hiding: ARM mul/umulh have ~4-5 cycle latency. By interleaving -//! S-box computation with MDS operations, we hide much of this latency. - -use core::arch::aarch64::*; -use core::arch::asm; - -use super::utils::{add_asm, mul_add_asm, mul_asm}; -use crate::P; - -/// Compute x / 2 in the Goldilocks field, matching `halve_u64::

`. -#[inline(always)] -unsafe fn div2_asm(x: u64) -> u64 { - let shift = (P + 1) >> 1; - let result: u64; - let _tmp: u64; - - unsafe { - asm!( - // result = x >> 1 - "lsr {result}, {x}, #1", - // tmp = x & 1 - "and {tmp}, {x}, #1", - // if tmp != 0 (x odd), tmp := shift, else tmp := 0 - "cmp {tmp}, #0", - "csel {tmp}, {shift}, xzr, ne", - // result += tmp - "add {result}, {result}, {tmp}", - x = in(reg) x, - shift = in(reg) shift, - tmp = out(reg) _tmp, - result = out(reg) result, - options(pure, nomem, nostack), - ); - } - - result -} - -#[inline(always)] -unsafe fn div4_asm(x: u64) -> u64 { - unsafe { div2_asm(div2_asm(x)) } -} - -#[inline(always)] -unsafe fn div8_asm(x: u64) -> u64 { - unsafe { div2_asm(div4_asm(x)) } -} - -#[inline(always)] -unsafe fn div16_asm(x: u64) -> u64 { - unsafe { div2_asm(div8_asm(x)) } -} - -#[inline(always)] -unsafe fn div32_asm(x: u64) -> u64 { - unsafe { div4_asm(div8_asm(x)) } -} - -/// Compute x * 2^{-32} mod P using the Goldilocks structure. -/// -/// Since P = 2^64 - 2^32 + 1, we have 2^{-32} ≡ 1 - 2^{32} (mod P). -/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P, -/// where x_hi = x >> 32, x_lo = x & 0xFFFFFFFF. -#[inline(always)] -unsafe fn div_2_32_asm(x: u64) -> u64 { - let result: u64; - let _hi: u64; - let _lo: u64; - let _t: u64; - let _sum: u64; - let _adj: u64; - - unsafe { - asm!( - "lsr {hi}, {x}, #32", - "and {lo}, {x}, #0xFFFFFFFF", - "add {sum}, {hi}, {lo}", - "lsl {t}, {lo}, #32", - "subs {result}, {sum}, {t}", - "csetm {adj:w}, cc", - "sub {result}, {result}, {adj}", - x = in(reg) x, - hi = out(reg) _hi, - lo = out(reg) _lo, - t = out(reg) _t, - sum = out(reg) _sum, - result = out(reg) result, - adj = lateout(reg) _adj, - options(pure, nomem, nostack), - ); - } - - result -} - -/// Subtract two Goldilocks elements with borrow handling using inline assembly. 
-#[inline(always)] -unsafe fn sub_asm(a: u64, b: u64) -> u64 { - let result: u64; - let _adj: u64; - - unsafe { - asm!( - "subs {result}, {a}, {b}", - "csetm {adj:w}, cc", - "sub {result}, {result}, {adj}", - a = in(reg) a, - b = in(reg) b, - result = out(reg) result, - adj = out(reg) _adj, - options(pure, nomem, nostack), - ); - } - - result -} - -/// Split-state generic internal permute: s0 stays in a register across all rounds. -#[inline] -#[allow(clippy::needless_range_loop)] -pub fn internal_permute_state_asm( - state: &mut [u64; WIDTH], - diag: &[u64; WIDTH], - constants: &[u64], -) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - s0 = add_asm(s0, rc); - let s0_2 = mul_asm(s0, s0); - let s0_3 = mul_asm(s0_2, s0); - let s0_4 = mul_asm(s0_2, s0_2); - s0 = mul_asm(s0_3, s0_4); - - let mut sum_hi: u64 = 0; - for i in 1..WIDTH { - sum_hi = add_asm(sum_hi, state[i]); - } - - let mut diag_muls: [u64; WIDTH] = [0; WIDTH]; - for i in 1..WIDTH { - diag_muls[i] = mul_asm(state[i], diag[i]); - } - - let sum = add_asm(sum_hi, s0); - s0 = mul_add_asm(s0, diag[0], sum); - - for i in 1..WIDTH { - state[i] = add_asm(diag_muls[i], sum); - } - } - } - state[0] = s0; -} - -/// Split-state generic dual-lane internal permute for packed processing. 
-#[inline] -#[allow(clippy::needless_range_loop)] -pub fn internal_permute_split_dual( - lane0: &mut [u64; WIDTH], - lane1: &mut [u64; WIDTH], - diag: &[u64; WIDTH], - constants: &[u64], -) { - let mut s0_a = lane0[0]; - let mut s0_b = lane1[0]; - for &rc in constants { - unsafe { - s0_a = add_asm(s0_a, rc); - s0_b = add_asm(s0_b, rc); - let s0_2_a = mul_asm(s0_a, s0_a); - let s0_2_b = mul_asm(s0_b, s0_b); - let s0_3_a = mul_asm(s0_2_a, s0_a); - let s0_3_b = mul_asm(s0_2_b, s0_b); - let s0_4_a = mul_asm(s0_2_a, s0_2_a); - let s0_4_b = mul_asm(s0_2_b, s0_2_b); - s0_a = mul_asm(s0_3_a, s0_4_a); - s0_b = mul_asm(s0_3_b, s0_4_b); - - let mut sum_hi_a: u64 = 0; - let mut sum_hi_b: u64 = 0; - for i in 1..WIDTH { - sum_hi_a = add_asm(sum_hi_a, lane0[i]); - sum_hi_b = add_asm(sum_hi_b, lane1[i]); - } - - let mut diag_muls_a: [u64; WIDTH] = [0; WIDTH]; - let mut diag_muls_b: [u64; WIDTH] = [0; WIDTH]; - for i in 1..WIDTH { - diag_muls_a[i] = mul_asm(lane0[i], diag[i]); - diag_muls_b[i] = mul_asm(lane1[i], diag[i]); - } - - let sum_a = add_asm(sum_hi_a, s0_a); - let sum_b = add_asm(sum_hi_b, s0_b); - s0_a = mul_add_asm(s0_a, diag[0], sum_a); - s0_b = mul_add_asm(s0_b, diag[0], sum_b); - - for i in 1..WIDTH { - lane0[i] = add_asm(diag_muls_a[i], sum_a); - lane1[i] = add_asm(diag_muls_b[i], sum_b); - } - } - } - lane0[0] = s0_a; - lane1[0] = s0_b; -} - -/// Split-state W8 internal permute: s0 stays in a register across all rounds. 
-#[inline] -pub fn internal_permute_state_asm_w8(state: &mut [u64; 8], constants: &[u64]) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - s0 = add_asm(s0, rc); - let s0_2 = mul_asm(s0, s0); - - let sum1 = add_asm(state[1], state[2]); - let sum2 = add_asm(state[3], state[4]); - let sum3 = add_asm(state[5], state[6]); - - let s0_3 = mul_asm(s0_2, s0); - let s0_4 = mul_asm(s0_2, s0_2); - - let sum12 = add_asm(sum1, sum2); - let sum37 = add_asm(sum3, state[7]); - - let d1 = state[1]; - let d2 = double_asm(state[2]); - let d3 = div2_asm(state[3]); - let d4 = add_asm(double_asm(state[4]), state[4]); - - let sum_hi = add_asm(sum12, sum37); - - let d5 = div2_asm(state[5]); - let d6 = add_asm(double_asm(state[6]), state[6]); - let d7 = double_asm(double_asm(state[7])); - - s0 = mul_asm(s0_3, s0_4); - let sum = add_asm(sum_hi, s0); - // V[0]=-2: new_s0 = sum + (-2)*s0 = sum_hi + s0 - 2*s0 = sum_hi - s0 - s0 = sub_asm(sum_hi, s0); - - state[1] = add_asm(d1, sum); - state[2] = add_asm(d2, sum); - state[3] = add_asm(d3, sum); - state[4] = add_asm(d4, sum); - state[5] = sub_asm(sum, d5); - state[6] = sub_asm(sum, d6); - state[7] = sub_asm(sum, d7); - } - } - state[0] = s0; -} - -/// Split-state dual-lane W8 internal permute for packed processing. 
-#[inline] -pub fn internal_permute_split_dual_w8( - lane0: &mut [u64; 8], - lane1: &mut [u64; 8], - constants: &[u64], -) { - let mut s0_a = lane0[0]; - let mut s0_b = lane1[0]; - for &rc in constants { - unsafe { - s0_a = add_asm(s0_a, rc); - s0_b = add_asm(s0_b, rc); - - let s0_2_a = mul_asm(s0_a, s0_a); - let s0_2_b = mul_asm(s0_b, s0_b); - - let sum1_a = add_asm(lane0[1], lane0[2]); - let sum1_b = add_asm(lane1[1], lane1[2]); - let sum2_a = add_asm(lane0[3], lane0[4]); - let sum2_b = add_asm(lane1[3], lane1[4]); - let sum3_a = add_asm(lane0[5], lane0[6]); - let sum3_b = add_asm(lane1[5], lane1[6]); - - let s0_3_a = mul_asm(s0_2_a, s0_a); - let s0_3_b = mul_asm(s0_2_b, s0_b); - let s0_4_a = mul_asm(s0_2_a, s0_2_a); - let s0_4_b = mul_asm(s0_2_b, s0_2_b); - - let sum12_a = add_asm(sum1_a, sum2_a); - let sum12_b = add_asm(sum1_b, sum2_b); - let sum37_a = add_asm(sum3_a, lane0[7]); - let sum37_b = add_asm(sum3_b, lane1[7]); - - let d1_a = lane0[1]; - let d1_b = lane1[1]; - let d2_a = double_asm(lane0[2]); - let d2_b = double_asm(lane1[2]); - let d3_a = div2_asm(lane0[3]); - let d3_b = div2_asm(lane1[3]); - let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); - let d4_b = add_asm(double_asm(lane1[4]), lane1[4]); - - let sum_hi_a = add_asm(sum12_a, sum37_a); - let sum_hi_b = add_asm(sum12_b, sum37_b); - - let d5_a = div2_asm(lane0[5]); - let d5_b = div2_asm(lane1[5]); - let d6_a = add_asm(double_asm(lane0[6]), lane0[6]); - let d6_b = add_asm(double_asm(lane1[6]), lane1[6]); - let d7_a = double_asm(double_asm(lane0[7])); - let d7_b = double_asm(double_asm(lane1[7])); - - s0_a = mul_asm(s0_3_a, s0_4_a); - s0_b = mul_asm(s0_3_b, s0_4_b); - - let sum_a = add_asm(sum_hi_a, s0_a); - let sum_b = add_asm(sum_hi_b, s0_b); - s0_a = sub_asm(sum_hi_a, s0_a); - s0_b = sub_asm(sum_hi_b, s0_b); - - lane0[1] = add_asm(d1_a, sum_a); - lane1[1] = add_asm(d1_b, sum_b); - lane0[2] = add_asm(d2_a, sum_a); - lane1[2] = add_asm(d2_b, sum_b); - lane0[3] = add_asm(d3_a, sum_a); - lane1[3] = 
add_asm(d3_b, sum_b); - lane0[4] = add_asm(d4_a, sum_a); - lane1[4] = add_asm(d4_b, sum_b); - lane0[5] = sub_asm(sum_a, d5_a); - lane1[5] = sub_asm(sum_b, d5_b); - lane0[6] = sub_asm(sum_a, d6_a); - lane1[6] = sub_asm(sum_b, d6_b); - lane0[7] = sub_asm(sum_a, d7_a); - lane1[7] = sub_asm(sum_b, d7_b); - } - } - lane0[0] = s0_a; - lane1[0] = s0_b; -} - -/// Split-state W12 internal permute: s0 stays in a register across all rounds. -#[inline] -pub fn internal_permute_state_asm_w12(state: &mut [u64; 12], constants: &[u64]) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - s0 = add_asm(s0, rc); - let s0_2 = mul_asm(s0, s0); - - let sum1 = add_asm(state[1], state[2]); - let sum2 = add_asm(state[3], state[4]); - let sum3 = add_asm(state[5], state[6]); - let sum4 = add_asm(state[7], state[8]); - let sum5 = add_asm(state[9], state[10]); - - let s0_3 = mul_asm(s0_2, s0); - let s0_4 = mul_asm(s0_2, s0_2); - - let sum12 = add_asm(sum1, sum2); - let sum34 = add_asm(sum3, sum4); - let sum511 = add_asm(sum5, state[11]); - - let d1 = state[1]; - let d2 = double_asm(state[2]); - let d3 = div2_asm(state[3]); - let d4 = add_asm(double_asm(state[4]), state[4]); - - let sum1234 = add_asm(sum12, sum34); - - let d5 = double_asm(double_asm(state[5])); - let d6 = div2_asm(state[6]); - let d7 = add_asm(double_asm(state[7]), state[7]); - let d8 = double_asm(double_asm(state[8])); - - let sum_hi = add_asm(sum1234, sum511); - - let d9 = div4_asm(state[9]); - let d10 = div4_asm(state[10]); - let d11 = div8_asm(state[11]); - - s0 = mul_asm(s0_3, s0_4); - let sum = add_asm(sum_hi, s0); - s0 = sub_asm(sum_hi, s0); - - state[1] = add_asm(d1, sum); - state[2] = add_asm(d2, sum); - state[3] = add_asm(d3, sum); - state[4] = add_asm(d4, sum); - state[5] = add_asm(d5, sum); - state[6] = sub_asm(sum, d6); - state[7] = sub_asm(sum, d7); - state[8] = sub_asm(sum, d8); - state[9] = add_asm(d9, sum); - state[10] = sub_asm(sum, d10); - state[11] = add_asm(d11, sum); - } - } - state[0] = s0; -} 
- -/// Split-state dual-lane W12 internal permute for packed processing. -#[inline] -pub fn internal_permute_split_dual_w12( - lane0: &mut [u64; 12], - lane1: &mut [u64; 12], - constants: &[u64], -) { - let mut s0_a = lane0[0]; - let mut s0_b = lane1[0]; - for &rc in constants { - unsafe { - s0_a = add_asm(s0_a, rc); - s0_b = add_asm(s0_b, rc); - - let s0_2_a = mul_asm(s0_a, s0_a); - let s0_2_b = mul_asm(s0_b, s0_b); - - let sum1_a = add_asm(lane0[1], lane0[2]); - let sum1_b = add_asm(lane1[1], lane1[2]); - let sum2_a = add_asm(lane0[3], lane0[4]); - let sum2_b = add_asm(lane1[3], lane1[4]); - let sum3_a = add_asm(lane0[5], lane0[6]); - let sum3_b = add_asm(lane1[5], lane1[6]); - let sum4_a = add_asm(lane0[7], lane0[8]); - let sum4_b = add_asm(lane1[7], lane1[8]); - let sum5_a = add_asm(lane0[9], lane0[10]); - let sum5_b = add_asm(lane1[9], lane1[10]); - - let s0_3_a = mul_asm(s0_2_a, s0_a); - let s0_3_b = mul_asm(s0_2_b, s0_b); - let s0_4_a = mul_asm(s0_2_a, s0_2_a); - let s0_4_b = mul_asm(s0_2_b, s0_2_b); - - let sum12_a = add_asm(sum1_a, sum2_a); - let sum12_b = add_asm(sum1_b, sum2_b); - let sum34_a = add_asm(sum3_a, sum4_a); - let sum34_b = add_asm(sum3_b, sum4_b); - let sum511_a = add_asm(sum5_a, lane0[11]); - let sum511_b = add_asm(sum5_b, lane1[11]); - - let d1_a = lane0[1]; - let d1_b = lane1[1]; - let d2_a = double_asm(lane0[2]); - let d2_b = double_asm(lane1[2]); - let d3_a = div2_asm(lane0[3]); - let d3_b = div2_asm(lane1[3]); - let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); - let d4_b = add_asm(double_asm(lane1[4]), lane1[4]); - - let sum1234_a = add_asm(sum12_a, sum34_a); - let sum1234_b = add_asm(sum12_b, sum34_b); - - let d5_a = double_asm(double_asm(lane0[5])); - let d5_b = double_asm(double_asm(lane1[5])); - let d6_a = div2_asm(lane0[6]); - let d6_b = div2_asm(lane1[6]); - let d7_a = add_asm(double_asm(lane0[7]), lane0[7]); - let d7_b = add_asm(double_asm(lane1[7]), lane1[7]); - let d8_a = double_asm(double_asm(lane0[8])); - let d8_b = 
double_asm(double_asm(lane1[8])); - - let sum_hi_a = add_asm(sum1234_a, sum511_a); - let sum_hi_b = add_asm(sum1234_b, sum511_b); - - let d9_a = div4_asm(lane0[9]); - let d9_b = div4_asm(lane1[9]); - let d10_a = div4_asm(lane0[10]); - let d10_b = div4_asm(lane1[10]); - let d11_a = div8_asm(lane0[11]); - let d11_b = div8_asm(lane1[11]); - - s0_a = mul_asm(s0_3_a, s0_4_a); - s0_b = mul_asm(s0_3_b, s0_4_b); - - let sum_a = add_asm(sum_hi_a, s0_a); - let sum_b = add_asm(sum_hi_b, s0_b); - s0_a = sub_asm(sum_hi_a, s0_a); - s0_b = sub_asm(sum_hi_b, s0_b); - - lane0[1] = add_asm(d1_a, sum_a); - lane1[1] = add_asm(d1_b, sum_b); - lane0[2] = add_asm(d2_a, sum_a); - lane1[2] = add_asm(d2_b, sum_b); - lane0[3] = add_asm(d3_a, sum_a); - lane1[3] = add_asm(d3_b, sum_b); - lane0[4] = add_asm(d4_a, sum_a); - lane1[4] = add_asm(d4_b, sum_b); - lane0[5] = add_asm(d5_a, sum_a); - lane1[5] = add_asm(d5_b, sum_b); - lane0[6] = sub_asm(sum_a, d6_a); - lane1[6] = sub_asm(sum_b, d6_b); - lane0[7] = sub_asm(sum_a, d7_a); - lane1[7] = sub_asm(sum_b, d7_b); - lane0[8] = sub_asm(sum_a, d8_a); - lane1[8] = sub_asm(sum_b, d8_b); - lane0[9] = add_asm(d9_a, sum_a); - lane1[9] = add_asm(d9_b, sum_b); - lane0[10] = sub_asm(sum_a, d10_a); - lane1[10] = sub_asm(sum_b, d10_b); - lane0[11] = add_asm(d11_a, sum_a); - lane1[11] = add_asm(d11_b, sum_b); - } - } - lane0[0] = s0_a; - lane1[0] = s0_b; -} - -/// Split-state W16 internal permute: s0 stays in a register across all rounds. 
-#[inline] -pub fn internal_permute_state_asm_w16(state: &mut [u64; 16], constants: &[u64]) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - s0 = add_asm(s0, rc); - let s0_2 = mul_asm(s0, s0); - - let sum1 = add_asm(state[1], state[2]); - let sum2 = add_asm(state[3], state[4]); - let sum3 = add_asm(state[5], state[6]); - let sum4 = add_asm(state[7], state[8]); - let sum5 = add_asm(state[9], state[10]); - let sum6 = add_asm(state[11], state[12]); - let sum7 = add_asm(state[13], state[14]); - - let s0_3 = mul_asm(s0_2, s0); - let s0_4 = mul_asm(s0_2, s0_2); - - let sum12 = add_asm(sum1, sum2); - let sum34 = add_asm(sum3, sum4); - let sum56 = add_asm(sum5, sum6); - let sum715 = add_asm(sum7, state[15]); - - let sum1234 = add_asm(sum12, sum34); - let sum56715 = add_asm(sum56, sum715); - let sum_hi = add_asm(sum1234, sum56715); - - let d1 = state[1]; - let d2 = double_asm(state[2]); - let d3 = div2_asm(state[3]); - let d4 = add_asm(double_asm(state[4]), state[4]); - let d5 = double_asm(double_asm(state[5])); - let d6 = div2_asm(state[6]); - let d7 = add_asm(double_asm(state[7]), state[7]); - let d8 = double_asm(double_asm(state[8])); - - let d9 = div8_asm(state[9]); - let d10 = div16_asm(state[10]); - let d11 = div32_asm(state[11]); - let d12 = div8_asm(state[12]); - let d13 = div16_asm(state[13]); - let d14 = div32_asm(state[14]); - let d15 = div_2_32_asm(state[15]); - - s0 = mul_asm(s0_3, s0_4); - let sum = add_asm(sum_hi, s0); - s0 = sub_asm(sum_hi, s0); - - state[1] = add_asm(d1, sum); - state[2] = add_asm(d2, sum); - state[3] = add_asm(d3, sum); - state[4] = add_asm(d4, sum); - state[5] = add_asm(d5, sum); - state[6] = sub_asm(sum, d6); - state[7] = sub_asm(sum, d7); - state[8] = sub_asm(sum, d8); - state[9] = add_asm(d9, sum); - state[10] = add_asm(d10, sum); - state[11] = add_asm(d11, sum); - state[12] = sub_asm(sum, d12); - state[13] = sub_asm(sum, d13); - state[14] = sub_asm(sum, d14); - state[15] = add_asm(d15, sum); - } - } - state[0] = s0; -} 
- -/// Split-state dual-lane W16 internal permute for packed processing. -#[inline] -pub fn internal_permute_split_dual_w16( - lane0: &mut [u64; 16], - lane1: &mut [u64; 16], - constants: &[u64], -) { - let mut s0_a = lane0[0]; - let mut s0_b = lane1[0]; - for &rc in constants { - unsafe { - s0_a = add_asm(s0_a, rc); - s0_b = add_asm(s0_b, rc); - - let s0_2_a = mul_asm(s0_a, s0_a); - let s0_2_b = mul_asm(s0_b, s0_b); - - let sum1_a = add_asm(lane0[1], lane0[2]); - let sum1_b = add_asm(lane1[1], lane1[2]); - let sum2_a = add_asm(lane0[3], lane0[4]); - let sum2_b = add_asm(lane1[3], lane1[4]); - let sum3_a = add_asm(lane0[5], lane0[6]); - let sum3_b = add_asm(lane1[5], lane1[6]); - let sum4_a = add_asm(lane0[7], lane0[8]); - let sum4_b = add_asm(lane1[7], lane1[8]); - let sum5_a = add_asm(lane0[9], lane0[10]); - let sum5_b = add_asm(lane1[9], lane1[10]); - let sum6_a = add_asm(lane0[11], lane0[12]); - let sum6_b = add_asm(lane1[11], lane1[12]); - let sum7_a = add_asm(lane0[13], lane0[14]); - let sum7_b = add_asm(lane1[13], lane1[14]); - - let s0_3_a = mul_asm(s0_2_a, s0_a); - let s0_3_b = mul_asm(s0_2_b, s0_b); - let s0_4_a = mul_asm(s0_2_a, s0_2_a); - let s0_4_b = mul_asm(s0_2_b, s0_2_b); - - let sum12_a = add_asm(sum1_a, sum2_a); - let sum12_b = add_asm(sum1_b, sum2_b); - let sum34_a = add_asm(sum3_a, sum4_a); - let sum34_b = add_asm(sum3_b, sum4_b); - let sum56_a = add_asm(sum5_a, sum6_a); - let sum56_b = add_asm(sum5_b, sum6_b); - let sum715_a = add_asm(sum7_a, lane0[15]); - let sum715_b = add_asm(sum7_b, lane1[15]); - - let sum1234_a = add_asm(sum12_a, sum34_a); - let sum1234_b = add_asm(sum12_b, sum34_b); - let sum56715_a = add_asm(sum56_a, sum715_a); - let sum56715_b = add_asm(sum56_b, sum715_b); - let sum_hi_a = add_asm(sum1234_a, sum56715_a); - let sum_hi_b = add_asm(sum1234_b, sum56715_b); - - let d1_a = lane0[1]; - let d1_b = lane1[1]; - let d2_a = double_asm(lane0[2]); - let d2_b = double_asm(lane1[2]); - let d3_a = div2_asm(lane0[3]); - let d3_b = 
div2_asm(lane1[3]); - let d4_a = add_asm(double_asm(lane0[4]), lane0[4]); - let d4_b = add_asm(double_asm(lane1[4]), lane1[4]); - let d5_a = double_asm(double_asm(lane0[5])); - let d5_b = double_asm(double_asm(lane1[5])); - let d6_a = div2_asm(lane0[6]); - let d6_b = div2_asm(lane1[6]); - let d7_a = add_asm(double_asm(lane0[7]), lane0[7]); - let d7_b = add_asm(double_asm(lane1[7]), lane1[7]); - let d8_a = double_asm(double_asm(lane0[8])); - let d8_b = double_asm(double_asm(lane1[8])); - - let d9_a = div8_asm(lane0[9]); - let d9_b = div8_asm(lane1[9]); - let d10_a = div16_asm(lane0[10]); - let d10_b = div16_asm(lane1[10]); - let d11_a = div32_asm(lane0[11]); - let d11_b = div32_asm(lane1[11]); - let d12_a = div8_asm(lane0[12]); - let d12_b = div8_asm(lane1[12]); - let d13_a = div16_asm(lane0[13]); - let d13_b = div16_asm(lane1[13]); - let d14_a = div32_asm(lane0[14]); - let d14_b = div32_asm(lane1[14]); - let d15_a = div_2_32_asm(lane0[15]); - let d15_b = div_2_32_asm(lane1[15]); - - s0_a = mul_asm(s0_3_a, s0_4_a); - s0_b = mul_asm(s0_3_b, s0_4_b); - - let sum_a = add_asm(sum_hi_a, s0_a); - let sum_b = add_asm(sum_hi_b, s0_b); - s0_a = sub_asm(sum_hi_a, s0_a); - s0_b = sub_asm(sum_hi_b, s0_b); - - lane0[1] = add_asm(d1_a, sum_a); - lane1[1] = add_asm(d1_b, sum_b); - lane0[2] = add_asm(d2_a, sum_a); - lane1[2] = add_asm(d2_b, sum_b); - lane0[3] = add_asm(d3_a, sum_a); - lane1[3] = add_asm(d3_b, sum_b); - lane0[4] = add_asm(d4_a, sum_a); - lane1[4] = add_asm(d4_b, sum_b); - lane0[5] = add_asm(d5_a, sum_a); - lane1[5] = add_asm(d5_b, sum_b); - lane0[6] = sub_asm(sum_a, d6_a); - lane1[6] = sub_asm(sum_b, d6_b); - lane0[7] = sub_asm(sum_a, d7_a); - lane1[7] = sub_asm(sum_b, d7_b); - lane0[8] = sub_asm(sum_a, d8_a); - lane1[8] = sub_asm(sum_b, d8_b); - lane0[9] = add_asm(d9_a, sum_a); - lane1[9] = add_asm(d9_b, sum_b); - lane0[10] = add_asm(d10_a, sum_a); - lane1[10] = add_asm(d10_b, sum_b); - lane0[11] = add_asm(d11_a, sum_a); - lane1[11] = add_asm(d11_b, sum_b); - 
lane0[12] = sub_asm(sum_a, d12_a); - lane1[12] = sub_asm(sum_b, d12_b); - lane0[13] = sub_asm(sum_a, d13_a); - lane1[13] = sub_asm(sum_b, d13_b); - lane0[14] = sub_asm(sum_a, d14_a); - lane1[14] = sub_asm(sum_b, d14_b); - lane0[15] = add_asm(d15_a, sum_a); - lane1[15] = add_asm(d15_b, sum_b); - } - } - lane0[0] = s0_a; - lane1[0] = s0_b; -} - -// External layer: S-box on all elements, then MDS. Pipelined for latency hiding. - -/// Double a Goldilocks element. -#[inline(always)] -unsafe fn double_asm(a: u64) -> u64 { - // SAFETY: add_asm is safe with valid Goldilocks field elements - unsafe { add_asm(a, a) } -} - -/// 4x4 circulant MDS with coefficients [2,3,1,1]. -#[inline(always)] -unsafe fn apply_mat4_asm(x: &mut [u64; 4]) { - unsafe { - let t01 = add_asm(x[0], x[1]); - let t23 = add_asm(x[2], x[3]); - let t0123 = add_asm(t01, t23); - let t01123 = add_asm(t0123, x[1]); - let t01233 = add_asm(t0123, x[3]); - - let y3 = add_asm(t01233, double_asm(x[0])); - let y1 = add_asm(t01123, double_asm(x[2])); - let y0 = add_asm(t01123, t01); - let y2 = add_asm(t01233, t23); - - x[0] = y0; - x[1] = y1; - x[2] = y2; - x[3] = y3; - } -} - -/// Poseidon2 MDS light permutation: 4x4 blocks + outer sums. -#[inline(always)] -pub unsafe fn mds_light_permutation_asm(state: &mut [u64; WIDTH]) { - unsafe { - // Apply M_4 to each consecutive four elements - let mut i = 0; - while i < WIDTH { - let chunk: &mut [u64; 4] = (&mut state[i..i + 4]).try_into().unwrap(); - apply_mat4_asm(chunk); - i += 4; - } - - // Compute the four sums of every 4th element - let mut sums = [0u64; 4]; - for j in (0..WIDTH).step_by(4) { - sums[0] = add_asm(sums[0], state[j]); - sums[1] = add_asm(sums[1], state[j + 1]); - sums[2] = add_asm(sums[2], state[j + 2]); - sums[3] = add_asm(sums[3], state[j + 3]); - } - - // Add sums back to state - for (i, elem) in state.iter_mut().enumerate() { - *elem = add_asm(*elem, sums[i % 4]); - } - } -} - -/// Pipelined S-box computation for all elements. 
-/// Computes x^7 for all elements by interleaving stages to hide latency. -#[inline(always)] -pub unsafe fn sbox_layer_asm(state: &mut [u64; WIDTH]) { - unsafe { - // Stage 1: Compute all x^2 values - let mut x2 = [0u64; WIDTH]; - for i in 0..WIDTH { - x2[i] = mul_asm(state[i], state[i]); - } - - // Stage 2: Compute x^3 and x^4 values interleaved - // x^3 = x^2 * x, x^4 = x^2 * x^2 - let mut x3 = [0u64; WIDTH]; - let mut x4 = [0u64; WIDTH]; - for i in 0..WIDTH { - x3[i] = mul_asm(x2[i], state[i]); - x4[i] = mul_asm(x2[i], x2[i]); - } - - // Stage 3: Compute x^7 = x^3 * x^4 - for i in 0..WIDTH { - state[i] = mul_asm(x3[i], x4[i]); - } - } -} - -/// Optimized external round: add RC, S-box, MDS. -#[inline(always)] -pub unsafe fn external_round_asm(state: &mut [u64; WIDTH], rc: &[u64; WIDTH]) { - unsafe { - // Add round constants - for i in 0..WIDTH { - state[i] = add_asm(state[i], rc[i]); - } - - // Apply S-box (x^7) to all elements - sbox_layer_asm(state); - - // Apply MDS light permutation - mds_light_permutation_asm(state); - } -} - -/// Interleaved dual-lane S-box layer for better ILP. 
-#[inline(always)] -pub unsafe fn sbox_layer_dual_asm( - state0: &mut [u64; WIDTH], - state1: &mut [u64; WIDTH], -) { - unsafe { - // Stage 1: Compute all x^2 values for both lanes (interleaved) - let mut x2_a = [0u64; WIDTH]; - let mut x2_b = [0u64; WIDTH]; - for i in 0..WIDTH { - x2_a[i] = mul_asm(state0[i], state0[i]); - x2_b[i] = mul_asm(state1[i], state1[i]); - } - - // Stage 2: Compute x^3 and x^4 for both lanes (interleaved) - let mut x3_a = [0u64; WIDTH]; - let mut x3_b = [0u64; WIDTH]; - let mut x4_a = [0u64; WIDTH]; - let mut x4_b = [0u64; WIDTH]; - for i in 0..WIDTH { - x3_a[i] = mul_asm(x2_a[i], state0[i]); - x3_b[i] = mul_asm(x2_b[i], state1[i]); - x4_a[i] = mul_asm(x2_a[i], x2_a[i]); - x4_b[i] = mul_asm(x2_b[i], x2_b[i]); - } - - // Stage 3: Compute x^7 = x^3 * x^4 for both lanes - for i in 0..WIDTH { - state0[i] = mul_asm(x3_a[i], x4_a[i]); - state1[i] = mul_asm(x3_b[i], x4_b[i]); - } - } -} - -/// Interleaved dual-lane external round for better ILP. -#[inline(always)] -pub unsafe fn external_round_dual_asm( - state0: &mut [u64; WIDTH], - state1: &mut [u64; WIDTH], - rc: &[u64; WIDTH], -) { - unsafe { - // Add round constants (interleaved) - for i in 0..WIDTH { - state0[i] = add_asm(state0[i], rc[i]); - state1[i] = add_asm(state1[i], rc[i]); - } - - // Apply S-box (interleaved dual-lane) - sbox_layer_dual_asm(state0, state1); - - // Apply MDS (sequential - MDS is mostly additions which are fast) - mds_light_permutation_asm(state0); - mds_light_permutation_asm(state1); - } -} - -/// Fully unrolled and fused external round for W8. 
-#[inline(always)] -pub unsafe fn external_round_fused_w8(state: &mut [u64; 8], rc: &[u64; 8]) { - unsafe { - let s0 = add_asm(state[0], rc[0]); - let s1 = add_asm(state[1], rc[1]); - let x2_0 = mul_asm(s0, s0); - let x2_1 = mul_asm(s1, s1); - - let s2 = add_asm(state[2], rc[2]); - let s3 = add_asm(state[3], rc[3]); - let x2_2 = mul_asm(s2, s2); - let x2_3 = mul_asm(s3, s3); - - let s4 = add_asm(state[4], rc[4]); - let s5 = add_asm(state[5], rc[5]); - let x2_4 = mul_asm(s4, s4); - let x2_5 = mul_asm(s5, s5); - - let s6 = add_asm(state[6], rc[6]); - let s7 = add_asm(state[7], rc[7]); - let x2_6 = mul_asm(s6, s6); - let x2_7 = mul_asm(s7, s7); - - let x3_0 = mul_asm(x2_0, s0); - let x3_1 = mul_asm(x2_1, s1); - let x4_0 = mul_asm(x2_0, x2_0); - let x4_1 = mul_asm(x2_1, x2_1); - let x3_2 = mul_asm(x2_2, s2); - let x3_3 = mul_asm(x2_3, s3); - let x4_2 = mul_asm(x2_2, x2_2); - let x4_3 = mul_asm(x2_3, x2_3); - let x3_4 = mul_asm(x2_4, s4); - let x3_5 = mul_asm(x2_5, s5); - let x4_4 = mul_asm(x2_4, x2_4); - let x4_5 = mul_asm(x2_5, x2_5); - let x3_6 = mul_asm(x2_6, s6); - let x3_7 = mul_asm(x2_7, s7); - let x4_6 = mul_asm(x2_6, x2_6); - let x4_7 = mul_asm(x2_7, x2_7); - - state[0] = mul_asm(x3_0, x4_0); - state[1] = mul_asm(x3_1, x4_1); - state[2] = mul_asm(x3_2, x4_2); - state[3] = mul_asm(x3_3, x4_3); - state[4] = mul_asm(x3_4, x4_4); - state[5] = mul_asm(x3_5, x4_5); - state[6] = mul_asm(x3_6, x4_6); - state[7] = mul_asm(x3_7, x4_7); - - mds_light_permutation_asm(state); - } -} - -/// Fully unrolled and fused dual-lane external round for W8. 
-#[inline(always)] -pub unsafe fn external_round_fused_dual_w8( - state0: &mut [u64; 8], - state1: &mut [u64; 8], - rc: &[u64; 8], -) { - unsafe { - // Half 1: elements 0-3 across both lanes - let s0_a = add_asm(state0[0], rc[0]); - let s0_b = add_asm(state1[0], rc[0]); - let s1_a = add_asm(state0[1], rc[1]); - let s1_b = add_asm(state1[1], rc[1]); - let s2_a = add_asm(state0[2], rc[2]); - let s2_b = add_asm(state1[2], rc[2]); - let s3_a = add_asm(state0[3], rc[3]); - let s3_b = add_asm(state1[3], rc[3]); - - let x2_0a = mul_asm(s0_a, s0_a); - let x2_0b = mul_asm(s0_b, s0_b); - let x2_1a = mul_asm(s1_a, s1_a); - let x2_1b = mul_asm(s1_b, s1_b); - let x2_2a = mul_asm(s2_a, s2_a); - let x2_2b = mul_asm(s2_b, s2_b); - let x2_3a = mul_asm(s3_a, s3_a); - let x2_3b = mul_asm(s3_b, s3_b); - - let x3_0a = mul_asm(x2_0a, s0_a); - let x3_0b = mul_asm(x2_0b, s0_b); - let x4_0a = mul_asm(x2_0a, x2_0a); - let x4_0b = mul_asm(x2_0b, x2_0b); - let x3_1a = mul_asm(x2_1a, s1_a); - let x3_1b = mul_asm(x2_1b, s1_b); - let x4_1a = mul_asm(x2_1a, x2_1a); - let x4_1b = mul_asm(x2_1b, x2_1b); - let x3_2a = mul_asm(x2_2a, s2_a); - let x3_2b = mul_asm(x2_2b, s2_b); - let x4_2a = mul_asm(x2_2a, x2_2a); - let x4_2b = mul_asm(x2_2b, x2_2b); - let x3_3a = mul_asm(x2_3a, s3_a); - let x3_3b = mul_asm(x2_3b, s3_b); - let x4_3a = mul_asm(x2_3a, x2_3a); - let x4_3b = mul_asm(x2_3b, x2_3b); - - state0[0] = mul_asm(x3_0a, x4_0a); - state1[0] = mul_asm(x3_0b, x4_0b); - state0[1] = mul_asm(x3_1a, x4_1a); - state1[1] = mul_asm(x3_1b, x4_1b); - state0[2] = mul_asm(x3_2a, x4_2a); - state1[2] = mul_asm(x3_2b, x4_2b); - state0[3] = mul_asm(x3_3a, x4_3a); - state1[3] = mul_asm(x3_3b, x4_3b); - - // Half 2: elements 4-7 across both lanes - let s4_a = add_asm(state0[4], rc[4]); - let s4_b = add_asm(state1[4], rc[4]); - let s5_a = add_asm(state0[5], rc[5]); - let s5_b = add_asm(state1[5], rc[5]); - let s6_a = add_asm(state0[6], rc[6]); - let s6_b = add_asm(state1[6], rc[6]); - let s7_a = add_asm(state0[7], 
rc[7]); - let s7_b = add_asm(state1[7], rc[7]); - - let x2_4a = mul_asm(s4_a, s4_a); - let x2_4b = mul_asm(s4_b, s4_b); - let x2_5a = mul_asm(s5_a, s5_a); - let x2_5b = mul_asm(s5_b, s5_b); - let x2_6a = mul_asm(s6_a, s6_a); - let x2_6b = mul_asm(s6_b, s6_b); - let x2_7a = mul_asm(s7_a, s7_a); - let x2_7b = mul_asm(s7_b, s7_b); - - let x3_4a = mul_asm(x2_4a, s4_a); - let x3_4b = mul_asm(x2_4b, s4_b); - let x4_4a = mul_asm(x2_4a, x2_4a); - let x4_4b = mul_asm(x2_4b, x2_4b); - let x3_5a = mul_asm(x2_5a, s5_a); - let x3_5b = mul_asm(x2_5b, s5_b); - let x4_5a = mul_asm(x2_5a, x2_5a); - let x4_5b = mul_asm(x2_5b, x2_5b); - let x3_6a = mul_asm(x2_6a, s6_a); - let x3_6b = mul_asm(x2_6b, s6_b); - let x4_6a = mul_asm(x2_6a, x2_6a); - let x4_6b = mul_asm(x2_6b, x2_6b); - let x3_7a = mul_asm(x2_7a, s7_a); - let x3_7b = mul_asm(x2_7b, s7_b); - let x4_7a = mul_asm(x2_7a, x2_7a); - let x4_7b = mul_asm(x2_7b, x2_7b); - - state0[4] = mul_asm(x3_4a, x4_4a); - state1[4] = mul_asm(x3_4b, x4_4b); - state0[5] = mul_asm(x3_5a, x4_5a); - state1[5] = mul_asm(x3_5b, x4_5b); - state0[6] = mul_asm(x3_6a, x4_6a); - state1[6] = mul_asm(x3_6b, x4_6b); - state0[7] = mul_asm(x3_7a, x4_7a); - state1[7] = mul_asm(x3_7b, x4_7b); - - mds_light_permutation_asm(state0); - mds_light_permutation_asm(state1); - } -} - -/// Run initial external rounds with pre-converted raw u64 constants. -#[inline] -pub fn external_initial_permute_state_asm( - state: &mut [u64; WIDTH], - initial_constants: &[[u64; WIDTH]], -) { - unsafe { - mds_light_permutation_asm(state); - } - for rc in initial_constants { - unsafe { - external_round_asm(state, rc); - } - } -} - -/// Run terminal external rounds with pre-converted raw u64 constants. -#[inline] -pub fn external_terminal_permute_state_asm( - state: &mut [u64; WIDTH], - terminal_constants: &[[u64; WIDTH]], -) { - for rc in terminal_constants { - unsafe { - external_round_asm(state, rc); - } - } -} - -/// W8-specialized initial external permute using fused rounds. 
-#[inline] -pub fn external_initial_permute_w8(state: &mut [u64; 8], initial_constants: &[[u64; 8]]) { - unsafe { - mds_light_permutation_asm(state); - } - for rc in initial_constants { - unsafe { - external_round_fused_w8(state, rc); - } - } -} - -/// W8-specialized terminal external permute using fused rounds. -#[inline] -pub fn external_terminal_permute_w8(state: &mut [u64; 8], terminal_constants: &[[u64; 8]]) { - for rc in terminal_constants { - unsafe { - external_round_fused_w8(state, rc); - } - } -} - -/// Dual-lane initial external permute with pre-converted constants. -#[inline] -pub fn external_initial_permute_dual( - lane0: &mut [u64; WIDTH], - lane1: &mut [u64; WIDTH], - constants: &[[u64; WIDTH]], -) { - unsafe { - mds_light_permutation_asm(lane0); - mds_light_permutation_asm(lane1); - } - for rc in constants { - unsafe { - external_round_dual_asm(lane0, lane1, rc); - } - } -} - -/// Dual-lane terminal external permute with pre-converted constants. -#[inline] -pub fn external_terminal_permute_dual( - lane0: &mut [u64; WIDTH], - lane1: &mut [u64; WIDTH], - constants: &[[u64; WIDTH]], -) { - for rc in constants { - unsafe { - external_round_dual_asm(lane0, lane1, rc); - } - } -} - -/// W8-specialized dual-lane initial external permute using fused rounds. -#[inline] -pub fn external_initial_permute_dual_w8( - lane0: &mut [u64; 8], - lane1: &mut [u64; 8], - constants: &[[u64; 8]], -) { - unsafe { - mds_light_permutation_asm(lane0); - mds_light_permutation_asm(lane1); - } - for rc in constants { - unsafe { - external_round_fused_dual_w8(lane0, lane1, rc); - } - } -} - -/// W8-specialized dual-lane terminal external permute using fused rounds. -#[inline] -pub fn external_terminal_permute_dual_w8( - lane0: &mut [u64; 8], - lane1: &mut [u64; 8], - constants: &[[u64; 8]], -) { - for rc in constants { - unsafe { - external_round_fused_dual_w8(lane0, lane1, rc); - } - } -} - -// NEON 2-wide Goldilocks field primitives. 
-// Each operates on both packed lanes simultaneously using uint64x2_t. - -#[inline(always)] -unsafe fn add_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - unsafe { - let res = vaddq_u64(a, b); - let overflow = vcgtq_u64(a, res); - let adj = vshrq_n_u64::<32>(overflow); - vaddq_u64(res, adj) - } -} - -#[inline(always)] -unsafe fn sub_neon(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - unsafe { - let res = vsubq_u64(a, b); - let underflow = vcgtq_u64(b, a); - let adj = vshrq_n_u64::<32>(underflow); - vsubq_u64(res, adj) - } -} - -#[inline(always)] -unsafe fn double_neon(a: uint64x2_t) -> uint64x2_t { - unsafe { add_neon(a, a) } -} - -#[inline(always)] -unsafe fn div2_neon(x: uint64x2_t) -> uint64x2_t { - unsafe { - let half_p_plus_1 = vdupq_n_u64((P + 1) >> 1); - let one = vdupq_n_u64(1); - let is_odd = vandq_u64(x, one); - let half = vshrq_n_u64::<1>(x); - let mask = vtstq_u64(is_odd, is_odd); - let adj = vandq_u64(mask, half_p_plus_1); - vaddq_u64(half, adj) - } -} - -#[inline(always)] -unsafe fn div4_neon(x: uint64x2_t) -> uint64x2_t { - unsafe { div2_neon(div2_neon(x)) } -} - -#[inline(always)] -unsafe fn div8_neon(x: uint64x2_t) -> uint64x2_t { - unsafe { div2_neon(div4_neon(x)) } -} - -#[inline(always)] -unsafe fn div16_neon(x: uint64x2_t) -> uint64x2_t { - unsafe { div2_neon(div8_neon(x)) } -} - -#[inline(always)] -unsafe fn div32_neon(x: uint64x2_t) -> uint64x2_t { - unsafe { div4_neon(div8_neon(x)) } -} - -/// Compute x * 2^{-32} mod P for each lane using Goldilocks structure. -/// -/// Since P = 2^64 - 2^32 + 1, we have 2^{-32} ≡ 1 - 2^{32} (mod P). -/// So x * 2^{-32} = x_hi + x_lo - (x_lo << 32) mod P. 
-#[inline(always)] -unsafe fn div_2_32_neon(x: uint64x2_t) -> uint64x2_t { - unsafe { - let mask_32 = vdupq_n_u64(0xFFFFFFFF); - let hi = vshrq_n_u64::<32>(x); - let lo = vandq_u64(x, mask_32); - let sum = vaddq_u64(hi, lo); - let t = vshlq_n_u64::<32>(lo); - sub_neon(sum, t) - } -} - -#[inline(always)] -unsafe fn apply_mat4_neon(x: &mut [uint64x2_t; 4]) { - unsafe { - let t01 = add_neon(x[0], x[1]); - let t23 = add_neon(x[2], x[3]); - let t0123 = add_neon(t01, t23); - let t01123 = add_neon(t0123, x[1]); - let t01233 = add_neon(t0123, x[3]); - x[3] = add_neon(t01233, double_neon(x[0])); - x[1] = add_neon(t01123, double_neon(x[2])); - x[0] = add_neon(t01123, t01); - x[2] = add_neon(t01233, t23); - } -} - -#[inline(always)] -unsafe fn mds_light_neon(state: &mut [uint64x2_t; WIDTH]) { - unsafe { - let mut i = 0; - while i < WIDTH { - let chunk: &mut [uint64x2_t; 4] = (&mut state[i..i + 4]).try_into().unwrap(); - apply_mat4_neon(chunk); - i += 4; - } - let zero = vdupq_n_u64(0); - let mut sums = [zero; 4]; - for j in (0..WIDTH).step_by(4) { - sums[0] = add_neon(sums[0], state[j]); - sums[1] = add_neon(sums[1], state[j + 1]); - sums[2] = add_neon(sums[2], state[j + 2]); - sums[3] = add_neon(sums[3], state[j + 3]); - } - for (i, elem) in state.iter_mut().enumerate() { - *elem = add_neon(*elem, sums[i % 4]); - } - } -} - -/// Convert separate lane arrays into NEON vector array. -#[inline] -pub fn lanes_to_neon( - lane0: &[u64; WIDTH], - lane1: &[u64; WIDTH], -) -> [uint64x2_t; WIDTH] { - core::array::from_fn(|i| unsafe { - let lo = vcreate_u64(lane0[i]); - let hi = vcreate_u64(lane1[i]); - vcombine_u64(lo, hi) - }) -} - -/// Convert NEON vector array back to separate lane arrays. 
-#[inline] -pub fn neon_to_lanes( - state_v: &[uint64x2_t; WIDTH], - lane0: &mut [u64; WIDTH], - lane1: &mut [u64; WIDTH], -) { - for i in 0..WIDTH { - unsafe { - lane0[i] = vgetq_lane_u64::<0>(state_v[i]); - lane1[i] = vgetq_lane_u64::<1>(state_v[i]); - } - } -} - -// NEON-based internal permutation: both packed lanes processed -// simultaneously via uint64x2_t for sum tree, diagonal, and writeback. - -#[inline] -pub fn internal_permute_neon_w12(state: &mut [uint64x2_t; 12], constants: &[u64]) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - let rc_vec = vdupq_n_u64(rc); - s0 = add_neon(s0, rc_vec); - - let s0_0 = vgetq_lane_u64::<0>(s0); - let s0_1 = vgetq_lane_u64::<1>(s0); - let s0_2_0 = mul_asm(s0_0, s0_0); - let s0_2_1 = mul_asm(s0_1, s0_1); - - let sum1 = add_neon(state[1], state[2]); - let sum2 = add_neon(state[3], state[4]); - let sum3 = add_neon(state[5], state[6]); - let sum4 = add_neon(state[7], state[8]); - let sum5 = add_neon(state[9], state[10]); - - let s0_3_0 = mul_asm(s0_2_0, s0_0); - let s0_3_1 = mul_asm(s0_2_1, s0_1); - let s0_4_0 = mul_asm(s0_2_0, s0_2_0); - let s0_4_1 = mul_asm(s0_2_1, s0_2_1); - - let sum12 = add_neon(sum1, sum2); - let sum34 = add_neon(sum3, sum4); - let sum511 = add_neon(sum5, state[11]); - - let d1 = state[1]; - let d2 = double_neon(state[2]); - let d3 = div2_neon(state[3]); - let d4 = add_neon(double_neon(state[4]), state[4]); - - let sum1234 = add_neon(sum12, sum34); - - let d5 = double_neon(double_neon(state[5])); - let d6 = div2_neon(state[6]); - let d7 = add_neon(double_neon(state[7]), state[7]); - let d8 = double_neon(double_neon(state[8])); - - let sum_hi = add_neon(sum1234, sum511); - - let d9 = div4_neon(state[9]); - let d10 = div4_neon(state[10]); - let d11 = div8_neon(state[11]); - - let s0_7_0 = mul_asm(s0_3_0, s0_4_0); - let s0_7_1 = mul_asm(s0_3_1, s0_4_1); - let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1)); - - let sum = add_neon(sum_hi, s0_7); - s0 = sub_neon(sum_hi, s0_7); 
- - state[1] = add_neon(d1, sum); - state[2] = add_neon(d2, sum); - state[3] = add_neon(d3, sum); - state[4] = add_neon(d4, sum); - state[5] = add_neon(d5, sum); - state[6] = sub_neon(sum, d6); - state[7] = sub_neon(sum, d7); - state[8] = sub_neon(sum, d8); - state[9] = add_neon(d9, sum); - state[10] = sub_neon(sum, d10); - state[11] = add_neon(d11, sum); - } - } - state[0] = s0; -} - -#[inline] -pub fn internal_permute_neon_w16(state: &mut [uint64x2_t; 16], constants: &[u64]) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - let rc_vec = vdupq_n_u64(rc); - s0 = add_neon(s0, rc_vec); - - let s0_0 = vgetq_lane_u64::<0>(s0); - let s0_1 = vgetq_lane_u64::<1>(s0); - let s0_2_0 = mul_asm(s0_0, s0_0); - let s0_2_1 = mul_asm(s0_1, s0_1); - - let sum1 = add_neon(state[1], state[2]); - let sum2 = add_neon(state[3], state[4]); - let sum3 = add_neon(state[5], state[6]); - let sum4 = add_neon(state[7], state[8]); - let sum5 = add_neon(state[9], state[10]); - let sum6 = add_neon(state[11], state[12]); - let sum7 = add_neon(state[13], state[14]); - - let s0_3_0 = mul_asm(s0_2_0, s0_0); - let s0_3_1 = mul_asm(s0_2_1, s0_1); - let s0_4_0 = mul_asm(s0_2_0, s0_2_0); - let s0_4_1 = mul_asm(s0_2_1, s0_2_1); - - let sum12 = add_neon(sum1, sum2); - let sum34 = add_neon(sum3, sum4); - let sum56 = add_neon(sum5, sum6); - let sum715 = add_neon(sum7, state[15]); - - let sum1234 = add_neon(sum12, sum34); - let sum56715 = add_neon(sum56, sum715); - let sum_hi = add_neon(sum1234, sum56715); - - let d1 = state[1]; - let d2 = double_neon(state[2]); - let d3 = div2_neon(state[3]); - let d4 = add_neon(double_neon(state[4]), state[4]); - let d5 = double_neon(double_neon(state[5])); - let d6 = div2_neon(state[6]); - let d7 = add_neon(double_neon(state[7]), state[7]); - let d8 = double_neon(double_neon(state[8])); - - let d9 = div8_neon(state[9]); - let d10 = div16_neon(state[10]); - let d11 = div32_neon(state[11]); - let d12 = div8_neon(state[12]); - let d13 = div16_neon(state[13]); - 
let d14 = div32_neon(state[14]); - let d15 = div_2_32_neon(state[15]); - - let s0_7_0 = mul_asm(s0_3_0, s0_4_0); - let s0_7_1 = mul_asm(s0_3_1, s0_4_1); - let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1)); - - let sum = add_neon(sum_hi, s0_7); - s0 = sub_neon(sum_hi, s0_7); - - state[1] = add_neon(d1, sum); - state[2] = add_neon(d2, sum); - state[3] = add_neon(d3, sum); - state[4] = add_neon(d4, sum); - state[5] = add_neon(d5, sum); - state[6] = sub_neon(sum, d6); - state[7] = sub_neon(sum, d7); - state[8] = sub_neon(sum, d8); - state[9] = add_neon(d9, sum); - state[10] = add_neon(d10, sum); - state[11] = add_neon(d11, sum); - state[12] = sub_neon(sum, d12); - state[13] = sub_neon(sum, d13); - state[14] = sub_neon(sum, d14); - state[15] = add_neon(d15, sum); - } - } - state[0] = s0; -} - -#[inline] -pub fn internal_permute_neon( - state: &mut [uint64x2_t; WIDTH], - diag: &[u64; WIDTH], - constants: &[u64], -) { - let mut s0 = state[0]; - for &rc in constants { - unsafe { - let rc_vec = vdupq_n_u64(rc); - s0 = add_neon(s0, rc_vec); - - let s0_0 = vgetq_lane_u64::<0>(s0); - let s0_1 = vgetq_lane_u64::<1>(s0); - let s0_2_0 = mul_asm(s0_0, s0_0); - let s0_2_1 = mul_asm(s0_1, s0_1); - let s0_3_0 = mul_asm(s0_2_0, s0_0); - let s0_3_1 = mul_asm(s0_2_1, s0_1); - let s0_4_0 = mul_asm(s0_2_0, s0_2_0); - let s0_4_1 = mul_asm(s0_2_1, s0_2_1); - let s0_7_0 = mul_asm(s0_3_0, s0_4_0); - let s0_7_1 = mul_asm(s0_3_1, s0_4_1); - let s0_7 = vcombine_u64(vcreate_u64(s0_7_0), vcreate_u64(s0_7_1)); - - let zero = vdupq_n_u64(0); - let mut sum_hi = zero; - for &s in state.iter().skip(1) { - sum_hi = add_neon(sum_hi, s); - } - - let sum = add_neon(sum_hi, s0_7); - s0 = vcombine_u64( - vcreate_u64(mul_add_asm(s0_7_0, diag[0], vgetq_lane_u64::<0>(sum))), - vcreate_u64(mul_add_asm(s0_7_1, diag[0], vgetq_lane_u64::<1>(sum))), - ); - - for i in 1..WIDTH { - let s_0 = mul_add_asm( - vgetq_lane_u64::<0>(state[i]), - diag[i], - vgetq_lane_u64::<0>(sum), - ); - let s_1 = 
mul_add_asm( - vgetq_lane_u64::<1>(state[i]), - diag[i], - vgetq_lane_u64::<1>(sum), - ); - state[i] = vcombine_u64(vcreate_u64(s_0), vcreate_u64(s_1)); - } - } - } - state[0] = s0; -} - -// NEON-based external round: S-box stays scalar, MDS uses NEON. - -#[inline(always)] -unsafe fn sbox_neon(state: &mut [uint64x2_t; WIDTH]) { - unsafe { - let mut x2_0 = [0u64; WIDTH]; - let mut x2_1 = [0u64; WIDTH]; - for i in 0..WIDTH { - let a = vgetq_lane_u64::<0>(state[i]); - let b = vgetq_lane_u64::<1>(state[i]); - x2_0[i] = mul_asm(a, a); - x2_1[i] = mul_asm(b, b); - } - let mut x3_0 = [0u64; WIDTH]; - let mut x3_1 = [0u64; WIDTH]; - let mut x4_0 = [0u64; WIDTH]; - let mut x4_1 = [0u64; WIDTH]; - for i in 0..WIDTH { - let a = vgetq_lane_u64::<0>(state[i]); - let b = vgetq_lane_u64::<1>(state[i]); - x3_0[i] = mul_asm(x2_0[i], a); - x3_1[i] = mul_asm(x2_1[i], b); - x4_0[i] = mul_asm(x2_0[i], x2_0[i]); - x4_1[i] = mul_asm(x2_1[i], x2_1[i]); - } - for i in 0..WIDTH { - let r0 = mul_asm(x3_0[i], x4_0[i]); - let r1 = mul_asm(x3_1[i], x4_1[i]); - state[i] = vcombine_u64(vcreate_u64(r0), vcreate_u64(r1)); - } - } -} - -#[inline(always)] -unsafe fn external_round_neon( - state: &mut [uint64x2_t; WIDTH], - rc: &[u64; WIDTH], -) { - unsafe { - for i in 0..WIDTH { - let rc_vec = vdupq_n_u64(rc[i]); - state[i] = add_neon(state[i], rc_vec); - } - sbox_neon(state); - mds_light_neon(state); - } -} - -/// NEON initial external permute. -#[inline] -pub fn external_initial_neon( - state: &mut [uint64x2_t; WIDTH], - constants: &[[u64; WIDTH]], -) { - unsafe { - mds_light_neon(state); - } - for rc in constants { - unsafe { - external_round_neon(state, rc); - } - } -} - -/// NEON terminal external permute. 
-#[inline] -pub fn external_terminal_neon( - state: &mut [uint64x2_t; WIDTH], - constants: &[[u64; WIDTH]], -) { - for rc in constants { - unsafe { - external_round_neon(state, rc); - } - } -} - -#[cfg(test)] -mod tests { - use alloc::vec::Vec; - - use p3_field::{PrimeCharacteristicRing, PrimeField64}; - use p3_poseidon2::{MDSMat4, matmul_internal, mds_light_permutation}; - use proptest::prelude::*; - use rand::rngs::SmallRng; - use rand::{RngExt, SeedableRng}; - - use super::*; - use crate::{ - Goldilocks, MATRIX_DIAG_8_GOLDILOCKS, MATRIX_DIAG_12_GOLDILOCKS, MATRIX_DIAG_16_GOLDILOCKS, - MATRIX_DIAG_20_GOLDILOCKS, - }; - - type F = Goldilocks; - - /// Reduce a raw u64 to its canonical Goldilocks representative. - fn canon(x: u64) -> u64 { - F::new(x).as_canonical_u64() - } - - /// Pack two u64 lanes into a single NEON vector. - unsafe fn make_neon(a: u64, b: u64) -> uint64x2_t { - unsafe { vcombine_u64(vcreate_u64(a), vcreate_u64(b)) } - } - - /// Extract both u64 lanes from a NEON vector. - unsafe fn read_neon(v: uint64x2_t) -> (u64, u64) { - unsafe { (vgetq_lane_u64::<0>(v), vgetq_lane_u64::<1>(v)) } - } - - proptest! { - #[test] - fn test_sub_asm(a: u64, b: u64) { - // Compute a - b using the standard field implementation. - let expected = (F::new(a) - F::new(b)).as_canonical_u64(); - - // The ASM version should give the same canonical result. - let got = canon(unsafe { sub_asm(a, b) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_double_asm(a: u64) { - // Doubling is just a + a in the field. - let expected = (F::new(a) + F::new(a)).as_canonical_u64(); - - // The ASM shortcut should match. - let got = canon(unsafe { double_asm(a) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_div2_asm(x: u64) { - // Dividing by 2 is one halving in the field. 
- let expected = F::new(x).halve().as_canonical_u64(); - - let got = canon(unsafe { div2_asm(x) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_div4_asm(x: u64) { - // Dividing by 4 is two halvings. - let expected = F::new(x).halve().halve().as_canonical_u64(); - - let got = canon(unsafe { div4_asm(x) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_div8_asm(x: u64) { - // Dividing by 8 is three halvings. - let expected = F::new(x).halve().halve().halve().as_canonical_u64(); - - let got = canon(unsafe { div8_asm(x) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_div16_asm(x: u64) { - // Dividing by 16 is four halvings. - let expected = F::new(x).halve().halve().halve().halve().as_canonical_u64(); - - let got = canon(unsafe { div16_asm(x) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_div32_asm(x: u64) { - // Dividing by 32 is five halvings. - let expected = F::new(x) - .halve().halve().halve().halve().halve() - .as_canonical_u64(); - - let got = canon(unsafe { div32_asm(x) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_div_2_32_asm(x: u64) { - // Dividing by 2^32: apply halve 32 times as reference. - let mut v = F::new(x); - for _ in 0..32 { - v = v.halve(); - } - let expected = v.as_canonical_u64(); - - let got = canon(unsafe { div_2_32_asm(x) }); - prop_assert_eq!(got, expected); - } - - #[test] - fn test_apply_mat4_asm(x0: u64, x1: u64, x2: u64, x3: u64) { - // Build field elements from the raw inputs. - let f = [F::new(x0), F::new(x1), F::new(x2), F::new(x3)]; - - // The [2,3,1,1] circulant matrix rows. - let two = F::TWO; - let three = two + F::ONE; - let e0 = two * f[0] + three * f[1] + f[2] + f[3]; - let e1 = f[0] + two * f[1] + three * f[2] + f[3]; - let e2 = f[0] + f[1] + two * f[2] + three * f[3]; - let e3 = three * f[0] + f[1] + f[2] + two * f[3]; - - // Run the ASM version on raw u64s. 
- let mut state = [x0, x1, x2, x3]; - unsafe { apply_mat4_asm(&mut state); } - - // Each slot must match the field-level reference. - prop_assert_eq!(canon(state[0]), e0.as_canonical_u64()); - prop_assert_eq!(canon(state[1]), e1.as_canonical_u64()); - prop_assert_eq!(canon(state[2]), e2.as_canonical_u64()); - prop_assert_eq!(canon(state[3]), e3.as_canonical_u64()); - } - - #[test] - fn test_mds_light_permutation_asm_w8(vals in prop::array::uniform8(any::())) { - // Build field-level state and apply the generic MDS. - let mut state_generic: [F; 8] = vals.map(F::new); - mds_light_permutation(&mut state_generic, &MDSMat4); - - // Run the ASM version on the same raw values. - let mut state_asm = vals; - unsafe { mds_light_permutation_asm(&mut state_asm); } - - // Every element must agree. - for i in 0..8 { - prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64()); - } - } - - #[test] - fn test_mds_light_permutation_asm_w12(vals in prop::array::uniform12(any::())) { - let mut state_generic: [F; 12] = vals.map(F::new); - mds_light_permutation(&mut state_generic, &MDSMat4); - - let mut state_asm = vals; - unsafe { mds_light_permutation_asm(&mut state_asm); } - - for i in 0..12 { - prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64()); - } - } - - #[test] - fn test_mds_light_permutation_asm_w16(vals in prop::array::uniform16(any::())) { - let mut state_generic: [F; 16] = vals.map(F::new); - mds_light_permutation(&mut state_generic, &MDSMat4); - - let mut state_asm = vals; - unsafe { mds_light_permutation_asm(&mut state_asm); } - - for i in 0..16 { - prop_assert_eq!(canon(state_asm[i]), state_generic[i].as_canonical_u64()); - } - } - - #[test] - fn test_sbox_layer_asm(vals in prop::array::uniform8(any::())) { - // Apply the ASM S-box to a copy of the input. - let mut state = vals; - unsafe { sbox_layer_asm(&mut state); } - - // Verify each element is x^7 = x^3 * x^4. 
- for i in 0..8 { - let x = F::new(vals[i]); - let x2 = x * x; - let x3 = x2 * x; - let x4 = x2 * x2; - let x7 = x3 * x4; - prop_assert_eq!(canon(state[i]), x7.as_canonical_u64()); - } - } - - #[test] - fn test_external_round_asm( - vals in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - // Build reference: add round constants, apply x^7, then MDS. - let mut expected: [F; 8] = core::array::from_fn(|i| F::new(vals[i]) + F::new(rc[i])); - for x in expected.iter_mut() { - let x2 = *x * *x; - let x3 = x2 * *x; - let x4 = x2 * x2; - *x = x3 * x4; - } - mds_light_permutation(&mut expected, &MDSMat4); - - // Run the ASM external round. - let mut state = vals; - unsafe { external_round_asm(&mut state, &rc); } - - for i in 0..8 { - prop_assert_eq!(canon(state[i]), expected[i].as_canonical_u64()); - } - } - - #[test] - fn test_sbox_layer_dual_asm( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - // Run sbox on each lane independently as reference. - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - sbox_layer_asm(&mut ref0); - sbox_layer_asm(&mut ref1); - } - - // The dual-lane version processes both at once. - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { sbox_layer_dual_asm(&mut s0, &mut s1); } - - // Both lanes must match their single-lane reference. - for i in 0..8 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - #[test] - fn test_external_round_dual_asm( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - // Run external round on each lane independently as reference. - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - external_round_asm(&mut ref0, &rc); - external_round_asm(&mut ref1, &rc); - } - - // The dual-lane version processes both at once. 
- let mut s0 = vals0; - let mut s1 = vals1; - unsafe { external_round_dual_asm(&mut s0, &mut s1, &rc); } - - for i in 0..8 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - - #[test] - fn test_external_round_fused_w8( - vals in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - // The generic external round is the reference. - let mut ref_state = vals; - unsafe { external_round_asm(&mut ref_state, &rc); } - - // The fused W8 version should produce the same output. - let mut fused_state = vals; - unsafe { external_round_fused_w8(&mut fused_state, &rc); } - - for i in 0..8 { - prop_assert_eq!(canon(fused_state[i]), canon(ref_state[i])); - } - } - - #[test] - fn test_external_round_fused_dual_w8( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - // Run the fused round on each lane independently as reference. - let mut ref0 = vals0; - let mut ref1 = vals1; - unsafe { - external_round_fused_w8(&mut ref0, &rc); - external_round_fused_w8(&mut ref1, &rc); - } - - // The dual version processes both at once. - let mut s0 = vals0; - let mut s1 = vals1; - unsafe { external_round_fused_dual_w8(&mut s0, &mut s1, &rc); } - - for i in 0..8 { - prop_assert_eq!(canon(s0[i]), canon(ref0[i])); - prop_assert_eq!(canon(s1[i]), canon(ref1[i])); - } - } - } - - fn test_internal_round_matches(diag: [F; WIDTH]) { - let mut rng = SmallRng::seed_from_u64(12345); - - // Build random state and constants. - let mut state_asm: [F; WIDTH] = rng.random(); - let mut state_generic = state_asm; - - let internal_constants: [F; 22] = rng.random(); - let constants_raw: Vec = internal_constants.iter().map(|c| c.value).collect(); - let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); - - // Run the ASM internal permute on raw u64 representation. 
- let state_raw: &mut [u64; WIDTH] = - unsafe { &mut *(&mut state_asm as *mut [F; WIDTH] as *mut [u64; WIDTH]) }; - internal_permute_state_asm(state_raw, &diag_raw, &constants_raw); - - // Build the same result via field-level ops: add RC, S-box on s0, matmul. - for &rc in internal_constants.iter() { - state_generic[0] += rc; - let s = state_generic[0]; - let s2 = s * s; - let s3 = s2 * s; - let s4 = s2 * s2; - state_generic[0] = s3 * s4; - matmul_internal(&mut state_generic, diag); - } - - for i in 0..WIDTH { - assert_eq!( - state_asm[i].as_canonical_u64(), - state_generic[i].as_canonical_u64(), - "mismatch at index {i}" - ); - } - } - - #[test] - fn test_internal_round_width_8() { - test_internal_round_matches(MATRIX_DIAG_8_GOLDILOCKS); - } - - #[test] - fn test_internal_round_width_12() { - test_internal_round_matches(MATRIX_DIAG_12_GOLDILOCKS); - } - - #[test] - fn test_internal_round_width_16() { - test_internal_round_matches(MATRIX_DIAG_16_GOLDILOCKS); - } - - #[test] - fn test_internal_round_width_20() { - test_internal_round_matches(MATRIX_DIAG_20_GOLDILOCKS); - } - - fn test_specialized_matches_generic( - diag: [F; WIDTH], - specialized_fn: fn(&mut [u64; WIDTH], &[u64]), - ) { - let mut rng = SmallRng::seed_from_u64(42); - - let internal_constants: Vec = (0..22).map(|_| rng.random()).collect(); - let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); - - // Run both the specialized and generic versions on several random states. 
- for _ in 0..8 { - let mut state_specialized: [u64; WIDTH] = rng.random(); - let mut state_generic = state_specialized; - - specialized_fn(&mut state_specialized, &internal_constants); - internal_permute_state_asm(&mut state_generic, &diag_raw, &internal_constants); - - for i in 0..WIDTH { - assert_eq!(canon(state_specialized[i]), canon(state_generic[i])); - } - } - } - - #[test] - fn test_specialized_w8_matches_generic() { - test_specialized_matches_generic(MATRIX_DIAG_8_GOLDILOCKS, internal_permute_state_asm_w8); - } - - #[test] - fn test_specialized_w12_matches_generic() { - test_specialized_matches_generic(MATRIX_DIAG_12_GOLDILOCKS, internal_permute_state_asm_w12); - } - - #[test] - fn test_specialized_w16_matches_generic() { - test_specialized_matches_generic(MATRIX_DIAG_16_GOLDILOCKS, internal_permute_state_asm_w16); - } - - #[allow(clippy::type_complexity)] - fn test_dual_matches_single( - diag: [F; WIDTH], - single_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]), - dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64; WIDTH], &[u64]), - ) { - let mut rng = SmallRng::seed_from_u64(77); - - let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); - let constants: Vec = (0..22).map(|_| rng.random()).collect(); - - // Run single-lane on each lane independently. - let mut lane0: [u64; WIDTH] = rng.random(); - let mut lane1: [u64; WIDTH] = rng.random(); - let mut ref0 = lane0; - let mut ref1 = lane1; - - single_fn(&mut ref0, &diag_raw, &constants); - single_fn(&mut ref1, &diag_raw, &constants); - - // Run dual-lane on both at once. Must match. 
- dual_fn(&mut lane0, &mut lane1, &diag_raw, &constants); - - for i in 0..WIDTH { - assert_eq!(canon(lane0[i]), canon(ref0[i]), "lane0 mismatch at {i}"); - assert_eq!(canon(lane1[i]), canon(ref1[i]), "lane1 mismatch at {i}"); - } - } - - #[test] - fn test_internal_permute_split_dual_w8() { - test_dual_matches_single( - MATRIX_DIAG_8_GOLDILOCKS, - internal_permute_state_asm, - internal_permute_split_dual, - ); - } - - #[test] - fn test_internal_permute_split_dual_w12() { - test_dual_matches_single( - MATRIX_DIAG_12_GOLDILOCKS, - internal_permute_state_asm, - internal_permute_split_dual, - ); - } - - #[test] - fn test_internal_permute_split_dual_w16() { - test_dual_matches_single( - MATRIX_DIAG_16_GOLDILOCKS, - internal_permute_state_asm, - internal_permute_split_dual, - ); - } - - fn test_specialized_dual_matches_generic_dual( - diag: [F; WIDTH], - specialized_dual_fn: fn(&mut [u64; WIDTH], &mut [u64; WIDTH], &[u64]), - ) { - let mut rng = SmallRng::seed_from_u64(99); - - let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); - let constants: Vec = (0..22).map(|_| rng.random()).collect(); - - // The generic dual-lane version is the reference. - let mut lane0: [u64; WIDTH] = rng.random(); - let mut lane1: [u64; WIDTH] = rng.random(); - let mut ref0 = lane0; - let mut ref1 = lane1; - - internal_permute_split_dual(&mut ref0, &mut ref1, &diag_raw, &constants); - - // The specialized version must match. 
- specialized_dual_fn(&mut lane0, &mut lane1, &constants); - - for i in 0..WIDTH { - assert_eq!(canon(lane0[i]), canon(ref0[i]), "lane0 mismatch at {i}"); - assert_eq!(canon(lane1[i]), canon(ref1[i]), "lane1 mismatch at {i}"); - } - } - - #[test] - fn test_specialized_dual_w8_matches_generic() { - test_specialized_dual_matches_generic_dual( - MATRIX_DIAG_8_GOLDILOCKS, - internal_permute_split_dual_w8, - ); - } - - #[test] - fn test_specialized_dual_w12_matches_generic() { - test_specialized_dual_matches_generic_dual( - MATRIX_DIAG_12_GOLDILOCKS, - internal_permute_split_dual_w12, - ); - } - - #[test] - fn test_specialized_dual_w16_matches_generic() { - test_specialized_dual_matches_generic_dual( - MATRIX_DIAG_16_GOLDILOCKS, - internal_permute_split_dual_w16, - ); - } - - fn make_round_constants(seed: u64, num_rounds: usize) -> Vec<[u64; WIDTH]> { - let mut rng = SmallRng::seed_from_u64(seed); - (0..num_rounds).map(|_| rng.random()).collect() - } - - proptest! { - #[test] - fn test_external_initial_permute_state_asm( - vals in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(42, 4); - - // Reference: apply MDS once, then each external round manually. - let mut expected = vals; - unsafe { mds_light_permutation_asm(&mut expected); } - for rc in &constants { - unsafe { external_round_asm(&mut expected, rc); } - } - - // The composed function should give the same result. - let mut got = vals; - external_initial_permute_state_asm(&mut got, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(got[i]), canon(expected[i])); - } - } - - #[test] - fn test_external_terminal_permute_state_asm( - vals in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(43, 4); - - // Reference: just the external rounds, no initial MDS. 
- let mut expected = vals; - for rc in &constants { - unsafe { external_round_asm(&mut expected, rc); } - } - - let mut got = vals; - external_terminal_permute_state_asm(&mut got, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(got[i]), canon(expected[i])); - } - } - - #[test] - fn test_external_initial_permute_w8( - vals in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(44, 4); - - // The generic version is the reference. - let mut expected = vals; - external_initial_permute_state_asm(&mut expected, &constants); - - // The W8-specialized version must match. - let mut got = vals; - external_initial_permute_w8(&mut got, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(got[i]), canon(expected[i])); - } - } - - #[test] - fn test_external_terminal_permute_w8( - vals in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(45, 4); - - let mut expected = vals; - external_terminal_permute_state_asm(&mut expected, &constants); - - let mut got = vals; - external_terminal_permute_w8(&mut got, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(got[i]), canon(expected[i])); - } - } - - #[test] - fn test_external_initial_permute_dual( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(46, 4); - - // Run single-lane on each independently as reference. - let mut ref0 = vals0; - let mut ref1 = vals1; - external_initial_permute_state_asm(&mut ref0, &constants); - external_initial_permute_state_asm(&mut ref1, &constants); - - // The dual version processes both at once. 
- let mut l0 = vals0; - let mut l1 = vals1; - external_initial_permute_dual(&mut l0, &mut l1, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(l0[i]), canon(ref0[i])); - prop_assert_eq!(canon(l1[i]), canon(ref1[i])); - } - } - - #[test] - fn test_external_terminal_permute_dual( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(47, 4); - - let mut ref0 = vals0; - let mut ref1 = vals1; - external_terminal_permute_state_asm(&mut ref0, &constants); - external_terminal_permute_state_asm(&mut ref1, &constants); - - let mut l0 = vals0; - let mut l1 = vals1; - external_terminal_permute_dual(&mut l0, &mut l1, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(l0[i]), canon(ref0[i])); - prop_assert_eq!(canon(l1[i]), canon(ref1[i])); - } - } - - #[test] - fn test_external_initial_permute_dual_w8( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(48, 4); - - // The generic dual version is the reference. - let mut ref0 = vals0; - let mut ref1 = vals1; - external_initial_permute_dual(&mut ref0, &mut ref1, &constants); - - // The W8-specialized dual must match. 
- let mut l0 = vals0; - let mut l1 = vals1; - external_initial_permute_dual_w8(&mut l0, &mut l1, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(l0[i]), canon(ref0[i])); - prop_assert_eq!(canon(l1[i]), canon(ref1[i])); - } - } - - #[test] - fn test_external_terminal_permute_dual_w8( - vals0 in prop::array::uniform8(any::()), - vals1 in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(49, 4); - - let mut ref0 = vals0; - let mut ref1 = vals1; - external_terminal_permute_dual(&mut ref0, &mut ref1, &constants); - - let mut l0 = vals0; - let mut l1 = vals1; - external_terminal_permute_dual_w8(&mut l0, &mut l1, &constants); - - for i in 0..8 { - prop_assert_eq!(canon(l0[i]), canon(ref0[i])); - prop_assert_eq!(canon(l1[i]), canon(ref1[i])); - } - } - - #[test] - fn test_add_neon(a0: u64, a1: u64, b0: u64, b1: u64) { - unsafe { - // Pack two lanes into NEON vectors, add, then read back. - let (r0, r1) = read_neon(add_neon(make_neon(a0, a1), make_neon(b0, b1))); - - // Each lane must match its scalar add_asm equivalent. 
- prop_assert_eq!(canon(r0), canon(add_asm(a0, b0))); - prop_assert_eq!(canon(r1), canon(add_asm(a1, b1))); - } - } - - #[test] - fn test_sub_neon(a0: u64, a1: u64, b0: u64, b1: u64) { - unsafe { - let (r0, r1) = read_neon(sub_neon(make_neon(a0, a1), make_neon(b0, b1))); - - prop_assert_eq!(canon(r0), canon(sub_asm(a0, b0))); - prop_assert_eq!(canon(r1), canon(sub_asm(a1, b1))); - } - } - - #[test] - fn test_double_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(double_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(double_asm(a0))); - prop_assert_eq!(canon(r1), canon(double_asm(a1))); - } - } - - #[test] - fn test_div2_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(div2_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(div2_asm(a0))); - prop_assert_eq!(canon(r1), canon(div2_asm(a1))); - } - } - - #[test] - fn test_div4_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(div4_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(div4_asm(a0))); - prop_assert_eq!(canon(r1), canon(div4_asm(a1))); - } - } - - #[test] - fn test_div8_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(div8_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(div8_asm(a0))); - prop_assert_eq!(canon(r1), canon(div8_asm(a1))); - } - } - - #[test] - fn test_div16_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(div16_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(div16_asm(a0))); - prop_assert_eq!(canon(r1), canon(div16_asm(a1))); - } - } - - #[test] - fn test_div32_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(div32_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(div32_asm(a0))); - prop_assert_eq!(canon(r1), canon(div32_asm(a1))); - } - } - - #[test] - fn test_div_2_32_neon(a0: u64, a1: u64) { - unsafe { - let (r0, r1) = read_neon(div_2_32_neon(make_neon(a0, a1))); - - prop_assert_eq!(canon(r0), canon(div_2_32_asm(a0))); - 
prop_assert_eq!(canon(r1), canon(div_2_32_asm(a1))); - } - } - - #[test] - fn test_apply_mat4_neon( - a0: u64, a1: u64, a2: u64, a3: u64, - b0: u64, b1: u64, b2: u64, b3: u64, - ) { - unsafe { - // Scalar reference: run apply_mat4_asm on each lane separately. - let mut lane_a = [a0, a1, a2, a3]; - let mut lane_b = [b0, b1, b2, b3]; - apply_mat4_asm(&mut lane_a); - apply_mat4_asm(&mut lane_b); - - // NEON version: pack both lanes into vectors, apply, read back. - let mut neon_state = [ - make_neon(a0, b0), - make_neon(a1, b1), - make_neon(a2, b2), - make_neon(a3, b3), - ]; - apply_mat4_neon(&mut neon_state); - - for i in 0..4 { - let (r0, r1) = read_neon(neon_state[i]); - prop_assert_eq!(canon(r0), canon(lane_a[i])); - prop_assert_eq!(canon(r1), canon(lane_b[i])); - } - } - } - - #[test] - fn test_mds_light_neon_w8( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - unsafe { - // Run scalar MDS on each lane independently. - let mut ref_a = lane_a; - let mut ref_b = lane_b; - mds_light_permutation_asm(&mut ref_a); - mds_light_permutation_asm(&mut ref_b); - - // Pack both lanes into NEON vectors and run the NEON MDS. - let mut neon_state: [uint64x2_t; 8] = - core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i])); - mds_light_neon(&mut neon_state); - - // Each lane of each vector must match the scalar reference. - for i in 0..8 { - let (r0, r1) = read_neon(neon_state[i]); - prop_assert_eq!(canon(r0), canon(ref_a[i])); - prop_assert_eq!(canon(r1), canon(ref_b[i])); - } - } - } - - #[test] - fn test_sbox_neon( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - unsafe { - // Scalar reference on each lane. - let mut ref_a = lane_a; - let mut ref_b = lane_b; - sbox_layer_asm(&mut ref_a); - sbox_layer_asm(&mut ref_b); - - // NEON version on packed lanes. 
- let mut neon_state: [uint64x2_t; 8] = - core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i])); - sbox_neon(&mut neon_state); - - for i in 0..8 { - let (r0, r1) = read_neon(neon_state[i]); - prop_assert_eq!(canon(r0), canon(ref_a[i])); - prop_assert_eq!(canon(r1), canon(ref_b[i])); - } - } - } - - #[test] - fn test_external_round_neon( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - rc in prop::array::uniform8(any::()), - ) { - unsafe { - // Scalar reference on each lane. - let mut ref_a = lane_a; - let mut ref_b = lane_b; - external_round_asm(&mut ref_a, &rc); - external_round_asm(&mut ref_b, &rc); - - // NEON version on packed lanes. - let mut neon_state: [uint64x2_t; 8] = - core::array::from_fn(|i| make_neon(lane_a[i], lane_b[i])); - external_round_neon(&mut neon_state, &rc); - - for i in 0..8 { - let (r0, r1) = read_neon(neon_state[i]); - prop_assert_eq!(canon(r0), canon(ref_a[i])); - prop_assert_eq!(canon(r1), canon(ref_b[i])); - } - } - } - - #[test] - fn test_lanes_roundtrip( - lane0 in prop::array::uniform8(any::()), - lane1 in prop::array::uniform8(any::()), - ) { - // Pack two lane arrays into NEON vectors. - let packed = lanes_to_neon(&lane0, &lane1); - - // Unpack back into separate arrays. - let mut out0 = [0u64; 8]; - let mut out1 = [0u64; 8]; - neon_to_lanes(&packed, &mut out0, &mut out1); - - // Must recover the original values. - prop_assert_eq!(out0, lane0); - prop_assert_eq!(out1, lane1); - } - - #[test] - fn test_external_initial_neon( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(50, 4); - - // Scalar reference on each lane. - let mut ref_a = lane_a; - let mut ref_b = lane_b; - external_initial_permute_state_asm(&mut ref_a, &constants); - external_initial_permute_state_asm(&mut ref_b, &constants); - - // NEON version on packed lanes. 
- let mut neon_state = lanes_to_neon(&lane_a, &lane_b); - external_initial_neon(&mut neon_state, &constants); - - let mut out_a = [0u64; 8]; - let mut out_b = [0u64; 8]; - neon_to_lanes(&neon_state, &mut out_a, &mut out_b); - - for i in 0..8 { - prop_assert_eq!(canon(out_a[i]), canon(ref_a[i])); - prop_assert_eq!(canon(out_b[i]), canon(ref_b[i])); - } - } - - #[test] - fn test_external_terminal_neon( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - let constants = make_round_constants::<8>(51, 4); - - let mut ref_a = lane_a; - let mut ref_b = lane_b; - external_terminal_permute_state_asm(&mut ref_a, &constants); - external_terminal_permute_state_asm(&mut ref_b, &constants); - - let mut neon_state = lanes_to_neon(&lane_a, &lane_b); - external_terminal_neon(&mut neon_state, &constants); - - let mut out_a = [0u64; 8]; - let mut out_b = [0u64; 8]; - neon_to_lanes(&neon_state, &mut out_a, &mut out_b); - - for i in 0..8 { - prop_assert_eq!(canon(out_a[i]), canon(ref_a[i])); - prop_assert_eq!(canon(out_b[i]), canon(ref_b[i])); - } - } - } - - fn test_internal_neon_matches_scalar( - diag: [F; WIDTH], - neon_fn: fn(&mut [uint64x2_t; WIDTH], &[u64]), - scalar_fn: fn(&mut [u64; WIDTH], &[u64; WIDTH], &[u64]), - ) { - let mut rng = SmallRng::seed_from_u64(55); - - let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); - let constants: Vec = (0..22).map(|_| rng.random()).collect(); - - let lane_a: [u64; WIDTH] = rng.random(); - let lane_b: [u64; WIDTH] = rng.random(); - - // Scalar reference on each lane independently. - let mut ref_a = lane_a; - let mut ref_b = lane_b; - scalar_fn(&mut ref_a, &diag_raw, &constants); - scalar_fn(&mut ref_b, &diag_raw, &constants); - - // NEON version packs both lanes and processes them together. 
- let mut neon_state = lanes_to_neon(&lane_a, &lane_b); - neon_fn(&mut neon_state, &constants); - - let mut out_a = [0u64; WIDTH]; - let mut out_b = [0u64; WIDTH]; - neon_to_lanes(&neon_state, &mut out_a, &mut out_b); - - for i in 0..WIDTH { - assert_eq!(canon(out_a[i]), canon(ref_a[i]), "lane0 mismatch at {i}"); - assert_eq!(canon(out_b[i]), canon(ref_b[i]), "lane1 mismatch at {i}"); - } - } - - #[test] - fn test_internal_permute_neon_w12() { - test_internal_neon_matches_scalar( - MATRIX_DIAG_12_GOLDILOCKS, - internal_permute_neon_w12, - internal_permute_state_asm, - ); - } - - #[test] - fn test_internal_permute_neon_w16() { - test_internal_neon_matches_scalar( - MATRIX_DIAG_16_GOLDILOCKS, - internal_permute_neon_w16, - internal_permute_state_asm, - ); - } - - fn test_internal_neon_generic_matches_scalar(diag: [F; WIDTH]) { - let mut rng = SmallRng::seed_from_u64(66); - - let diag_raw: [u64; WIDTH] = core::array::from_fn(|i| diag[i].value); - let constants: Vec = (0..22).map(|_| rng.random()).collect(); - - let lane_a: [u64; WIDTH] = rng.random(); - let lane_b: [u64; WIDTH] = rng.random(); - - // Scalar reference. - let mut ref_a = lane_a; - let mut ref_b = lane_b; - internal_permute_state_asm(&mut ref_a, &diag_raw, &constants); - internal_permute_state_asm(&mut ref_b, &diag_raw, &constants); - - // Generic NEON version. 
- let mut neon_state = lanes_to_neon(&lane_a, &lane_b); - internal_permute_neon(&mut neon_state, &diag_raw, &constants); - - let mut out_a = [0u64; WIDTH]; - let mut out_b = [0u64; WIDTH]; - neon_to_lanes(&neon_state, &mut out_a, &mut out_b); - - for i in 0..WIDTH { - assert_eq!(canon(out_a[i]), canon(ref_a[i]), "lane0 mismatch at {i}"); - assert_eq!(canon(out_b[i]), canon(ref_b[i]), "lane1 mismatch at {i}"); - } - } - - #[test] - fn test_internal_permute_neon_generic_w8() { - test_internal_neon_generic_matches_scalar(MATRIX_DIAG_8_GOLDILOCKS); - } - - #[test] - fn test_internal_permute_neon_generic_w12() { - test_internal_neon_generic_matches_scalar(MATRIX_DIAG_12_GOLDILOCKS); - } - - #[test] - fn test_internal_permute_neon_generic_w16() { - test_internal_neon_generic_matches_scalar(MATRIX_DIAG_16_GOLDILOCKS); - } - - #[test] - fn test_internal_permute_neon_generic_w20() { - test_internal_neon_generic_matches_scalar(MATRIX_DIAG_20_GOLDILOCKS); - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs deleted file mode 100644 index 3d1951a57..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/aarch64_neon/utils.rs +++ /dev/null @@ -1,400 +0,0 @@ -//! Shared utilities for Goldilocks NEON assembly. - -use core::arch::asm; - -use super::packing::PackedGoldilocksNeon; -use crate::{Goldilocks, P}; - -const EPSILON: u64 = P.wrapping_neg(); // 2^32 - 1 - -// --------------------------------------------------------------------------- -// Scalar field arithmetic (inline assembly) -// --------------------------------------------------------------------------- - -/// Multiply two Goldilocks elements using inline assembly. -/// -/// Computes `a * b mod P` where P = 2^64 - 2^32 + 1. The reduction -/// uses the identity `2^64 = 2^32 - 1 (mod P)` (i.e. EPSILON) to fold -/// the 128-bit product back into a single limb. 
-#[inline(always)] -pub(super) unsafe fn mul_asm(a: u64, b: u64) -> u64 { - let _lo: u64; - let _hi: u64; - let _t0: u64; - let _t1: u64; - let _t2: u64; - let result: u64; - - unsafe { - asm!( - // Compute 128-bit product: hi:lo = a * b - "mul {lo}, {a}, {b}", - "umulh {hi}, {a}, {b}", - - // Reduce: result = lo - hi_hi + hi_lo * EPSILON - // where hi = hi_hi * 2^32 + hi_lo - - // t0 = lo - (hi >> 32), with borrow detection - "lsr {t0}, {hi}, #32", // t0 = hi >> 32 - "subs {t1}, {lo}, {t0}", // t1 = lo - t0, set flags - "csetm {t2:w}, cc", // t2 = -1 if borrow, 0 otherwise - "sub {t1}, {t1}, {t2}", // Adjust for borrow (subtract EPSILON) - - // t0 = (hi & EPSILON) * EPSILON - "and {t0}, {hi}, {epsilon}", // t0 = hi & EPSILON - "mul {t0}, {t0}, {epsilon}", // t0 = t0 * EPSILON - - // result = t1 + t0, with overflow detection - "adds {result}, {t1}, {t0}", // result = t1 + t0, set flags - "csetm {t2:w}, cs", // t2 = -1 if carry, 0 otherwise - "add {result}, {result}, {t2}", // Add EPSILON on overflow - - a = in(reg) a, - b = in(reg) b, - epsilon = in(reg) EPSILON, - lo = out(reg) _lo, - hi = out(reg) _hi, - t0 = out(reg) _t0, - t1 = out(reg) _t1, - t2 = out(reg) _t2, - result = out(reg) result, - options(pure, nomem, nostack), - ); - } - - result -} - -/// Compute `a * b + c` in the Goldilocks field using inline assembly. -/// -/// Fused multiply-add: forms the 128-bit product `a * b`, adds `c` into -/// the low limb (with carry propagation), then reduces modulo P. 
-#[inline(always)] -pub(super) unsafe fn mul_add_asm(a: u64, b: u64, c: u64) -> u64 { - let _lo: u64; - let _hi: u64; - let _t0: u64; - let _t1: u64; - let _t2: u64; - let result: u64; - - unsafe { - asm!( - // Compute 128-bit product: hi:lo = a * b - "mul {lo}, {a}, {b}", - "umulh {hi}, {a}, {b}", - - // Accumulate c into the 128-bit product: hi:lo = hi:lo + c - "adds {lo}, {lo}, {c}", - "adc {hi}, {hi}, xzr", - - // Reduce: result = lo - hi_hi + hi_lo * EPSILON - // where hi = hi_hi * 2^32 + hi_lo - - // t0 = lo - (hi >> 32), with borrow detection - "lsr {t0}, {hi}, #32", // t0 = hi >> 32 - "subs {t1}, {lo}, {t0}", // t1 = lo - t0, set flags - "csetm {t2:w}, cc", // t2 = -1 if borrow, 0 otherwise - "sub {t1}, {t1}, {t2}", // Adjust for borrow (subtract EPSILON) - - // t0 = (hi & EPSILON) * EPSILON - "and {t0}, {hi}, {epsilon}", // t0 = hi & EPSILON - "mul {t0}, {t0}, {epsilon}", // t0 = t0 * EPSILON - - // result = t1 + t0, with overflow detection - "adds {result}, {t1}, {t0}", // result = t1 + t0, set flags - "csetm {t2:w}, cs", // t2 = -1 if carry, 0 otherwise - "add {result}, {result}, {t2}", // Add EPSILON on overflow - - a = in(reg) a, - b = in(reg) b, - c = in(reg) c, - epsilon = in(reg) EPSILON, - lo = out(reg) _lo, - hi = out(reg) _hi, - t0 = out(reg) _t0, - t1 = out(reg) _t1, - t2 = out(reg) _t2, - result = out(reg) result, - options(pure, nomem, nostack), - ); - } - - result -} - -/// Add two Goldilocks elements with overflow handling using inline assembly. -/// -/// Computes `a + b mod P`. On overflow (carry out of 64 bits), subtracts -/// P by adding EPSILON (which equals -P mod 2^64, i.e. 2^32 - 1). 
-#[inline(always)] -pub(super) unsafe fn add_asm(a: u64, b: u64) -> u64 { - let result: u64; - let _adj: u64; - - unsafe { - asm!( - "adds {result}, {a}, {b}", - "csetm {adj:w}, cs", - "add {result}, {result}, {adj}", - a = in(reg) a, - b = in(reg) b, - result = out(reg) result, - adj = out(reg) _adj, - options(pure, nomem, nostack), - ); - } - - result -} - -// --------------------------------------------------------------------------- -// Lane conversion (packed NEON <-> raw u64 arrays) -// --------------------------------------------------------------------------- - -/// Unpack a packed NEON state into two raw `u64` lane arrays. -/// -/// Each packed slot contains two Goldilocks elements (lane 0, lane 1). -/// This function extracts the internal `u64` representation of each -/// element into two separate arrays, one per lane. -/// -/// # Layout -/// -/// ```text -/// packed[i] = (field_elem_a, field_elem_b) -/// -/// lane0[i] = field_elem_a.value (raw u64) -/// lane1[i] = field_elem_b.value (raw u64) -/// ``` -#[inline] -pub(super) fn unpack_lanes( - state: &[PackedGoldilocksNeon; WIDTH], -) -> ([u64; WIDTH], [u64; WIDTH]) { - // Extract the raw u64 representation from each packed slot. - let lane0: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[0].value); - let lane1: [u64; WIDTH] = core::array::from_fn(|i| state[i].0[1].value); - (lane0, lane1) -} - -/// Pack two raw `u64` lane arrays back into a packed NEON state. -/// -/// Each raw value is wrapped into a Goldilocks field element (with -/// reduction modulo P) and paired into a packed slot. -/// -/// # Layout -/// -/// ```text -/// lane0[i], lane1[i] -> packed[i] = (Goldilocks(lane0[i]), Goldilocks(lane1[i])) -/// ``` -#[inline] -pub(super) fn pack_lanes( - state: &mut [PackedGoldilocksNeon; WIDTH], - lane0: &[u64; WIDTH], - lane1: &[u64; WIDTH], -) { - for i in 0..WIDTH { - // Wrap each raw u64 into a field element and pair them. 
- state[i] = PackedGoldilocksNeon([Goldilocks::new(lane0[i]), Goldilocks::new(lane1[i])]); - } -} - -#[cfg(test)] -mod tests { - use p3_field::{PrimeCharacteristicRing, PrimeField64}; - use proptest::prelude::*; - - use super::*; - - type F = Goldilocks; - - /// Reduce a raw `u64` to its canonical Goldilocks representative. - fn canon(x: u64) -> u64 { - F::new(x).as_canonical_u64() - } - - proptest! { - // ---------------------------------------------------------------- - // Scalar field arithmetic - // ---------------------------------------------------------------- - - /// Verify ASM addition against field addition. - #[test] - fn test_add_asm(a: u64, b: u64) { - let expected = (F::new(a) + F::new(b)).as_canonical_u64(); - let got = canon(unsafe { add_asm(a, b) }); - prop_assert_eq!(got, expected); - } - - /// Verify ASM multiplication against field multiplication. - #[test] - fn test_mul_asm(a: u64, b: u64) { - let expected = (F::new(a) * F::new(b)).as_canonical_u64(); - let got = canon(unsafe { mul_asm(a, b) }); - prop_assert_eq!(got, expected); - } - - /// Verify ASM fused multiply-add against field multiply-add. - #[test] - fn test_mul_add_asm(a: u64, b: u64, c: u64) { - let expected = (F::new(a) * F::new(b) + F::new(c)).as_canonical_u64(); - let got = canon(unsafe { mul_add_asm(a, b, c) }); - prop_assert_eq!(got, expected); - } - - // ---------------------------------------------------------------- - // Unpack: packed state -> two raw u64 lane arrays - // ---------------------------------------------------------------- - - #[test] - fn test_unpack_lanes_w8( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - // Build a packed state from two independent lane arrays. - let packed: [PackedGoldilocksNeon; 8] = - core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); - - // Unpack into raw u64 lane arrays. 
- let (got0, got1) = unpack_lanes(&packed); - - // Each raw value must be the internal representation of the field element. - for i in 0..8 { - prop_assert_eq!(got0[i], F::new(lane_a[i]).value); - prop_assert_eq!(got1[i], F::new(lane_b[i]).value); - } - } - - #[test] - fn test_unpack_lanes_w12( - lane_a in prop::array::uniform12(any::()), - lane_b in prop::array::uniform12(any::()), - ) { - // Same verification, width 12. - let packed: [PackedGoldilocksNeon; 12] = - core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); - - let (got0, got1) = unpack_lanes(&packed); - - for i in 0..12 { - prop_assert_eq!(got0[i], F::new(lane_a[i]).value); - prop_assert_eq!(got1[i], F::new(lane_b[i]).value); - } - } - - // ---------------------------------------------------------------- - // Pack: two raw u64 lane arrays -> packed state - // ---------------------------------------------------------------- - - #[test] - fn test_pack_lanes_w8( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - // Pack two raw lane arrays into packed state. - let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8]; - pack_lanes(&mut packed, &lane_a, &lane_b); - - // Each packed element must hold the two corresponding field elements. - for i in 0..8 { - prop_assert_eq!(packed[i].0[0], F::new(lane_a[i])); - prop_assert_eq!(packed[i].0[1], F::new(lane_b[i])); - } - } - - #[test] - fn test_pack_lanes_w12( - lane_a in prop::array::uniform12(any::()), - lane_b in prop::array::uniform12(any::()), - ) { - // Same verification, width 12. 
- let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12]; - pack_lanes(&mut packed, &lane_a, &lane_b); - - for i in 0..12 { - prop_assert_eq!(packed[i].0[0], F::new(lane_a[i])); - prop_assert_eq!(packed[i].0[1], F::new(lane_b[i])); - } - } - - // ---------------------------------------------------------------- - // Roundtrip: pack then unpack recovers canonical values - // ---------------------------------------------------------------- - - #[test] - fn test_roundtrip_pack_unpack_w8( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - // Pack two lane arrays, then unpack them. - let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 8]; - pack_lanes(&mut packed, &lane_a, &lane_b); - let (out0, out1) = unpack_lanes(&packed); - - // The canonical form of the recovered values must match. - for i in 0..8 { - prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64()); - prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64()); - } - } - - #[test] - fn test_roundtrip_pack_unpack_w12( - lane_a in prop::array::uniform12(any::()), - lane_b in prop::array::uniform12(any::()), - ) { - // Same roundtrip, width 12. - let mut packed = [PackedGoldilocksNeon([F::ZERO; 2]); 12]; - pack_lanes(&mut packed, &lane_a, &lane_b); - let (out0, out1) = unpack_lanes(&packed); - - for i in 0..12 { - prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64()); - prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64()); - } - } - - // ---------------------------------------------------------------- - // Roundtrip: unpack then pack preserves packed state - // ---------------------------------------------------------------- - - #[test] - fn test_roundtrip_unpack_pack_w8( - lane_a in prop::array::uniform8(any::()), - lane_b in prop::array::uniform8(any::()), - ) { - // Start from a packed state. 
- let original: [PackedGoldilocksNeon; 8] = - core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); - - // Unpack into raw lanes, then pack back. - let (raw0, raw1) = unpack_lanes(&original); - let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 8]; - pack_lanes(&mut restored, &raw0, &raw1); - - // The restored packed state must equal the original. - for i in 0..8 { - prop_assert_eq!(restored[i].0[0], original[i].0[0]); - prop_assert_eq!(restored[i].0[1], original[i].0[1]); - } - } - - #[test] - fn test_roundtrip_unpack_pack_w12( - lane_a in prop::array::uniform12(any::()), - lane_b in prop::array::uniform12(any::()), - ) { - // Same reverse roundtrip, width 12. - let original: [PackedGoldilocksNeon; 12] = - core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])])); - - let (raw0, raw1) = unpack_lanes(&original); - let mut restored = [PackedGoldilocksNeon([F::ZERO; 2]); 12]; - pack_lanes(&mut restored, &raw0, &raw1); - - for i in 0..12 { - prop_assert_eq!(restored[i].0[0], original[i].0[0]); - prop_assert_eq!(restored[i].0[1], original[i].0[1]); - } - } - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs deleted file mode 100644 index 5ac38a28b..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/extension.rs +++ /dev/null @@ -1,217 +0,0 @@ -use p3_field::extension::{ - BinomiallyExtendable, BinomiallyExtendableAlgebra, HasTwoAdicBinomialExtension, -}; -use p3_field::{PrimeCharacteristicRing, TwoAdicField, field_to_array}; - -use crate::Goldilocks; - -impl BinomiallyExtendableAlgebra for Goldilocks {} - -impl BinomiallyExtendable<2> for Goldilocks { - // Verifiable in Sage with - // `R. = GF(p)[]; assert (x^2 - 7).is_irreducible()`. - const W: Self = Self::new(7); - - // DTH_ROOT = W^((p - 1)/2). 
- const DTH_ROOT: Self = Self::new(18446744069414584320); - - const EXT_GENERATOR: [Self; 2] = [ - Self::new(18081566051660590251), - Self::new(16121475356294670766), - ]; -} - -impl HasTwoAdicBinomialExtension<2> for Goldilocks { - const EXT_TWO_ADICITY: usize = 33; - - fn ext_two_adic_generator(bits: usize) -> [Self; 2] { - assert!(bits <= 33); - - if bits == 33 { - [Self::ZERO, Self::new(15659105665374529263)] - } else { - [Self::two_adic_generator(bits), Self::ZERO] - } - } -} - -impl BinomiallyExtendableAlgebra for Goldilocks {} - -impl BinomiallyExtendable<3> for Goldilocks { - // Verifiable in Sage with - // `R. = GF(p)[]; assert (x^3 - 2).is_irreducible()`. - // Same irreducible as Lambda's Degree3GoldilocksExtensionField. - const W: Self = Self::new(2); - - // DTH_ROOT = primitive 3rd root of unity = 7^((p-1)/3) mod p. - const DTH_ROOT: Self = Self::new(18446744065119617025); - - // Generator of GF(p^3)* = 5 + w. Verified: passes order checks for - // all small prime factors of p^3 - 1. - const EXT_GENERATOR: [Self; 3] = [Self::new(5), Self::ONE, Self::ZERO]; -} - -impl HasTwoAdicBinomialExtension<3> for Goldilocks { - // v_2(p^3 - 1) = v_2(p-1) + v_2(p^2+p+1) = 32 + 0 = 32. - const EXT_TWO_ADICITY: usize = 32; - - fn ext_two_adic_generator(bits: usize) -> [Self; 3] { - assert!(bits <= 32); - field_to_array(Self::two_adic_generator(bits)) - } -} - -impl BinomiallyExtendableAlgebra for Goldilocks {} - -impl BinomiallyExtendable<5> for Goldilocks { - // Verifiable via: - // ```sage - // # Define Fp - // p = 2**64 - 2**32 + 1 - // F = GF(p) - - // # Define Fp[z] - // R. 
= PolynomialRing(F) - - // # The polynomial x^5-3 is irreducible - // assert(R(z^5-3).is_irreducible()) - // ``` - const W: Self = Self::new(3); - - // 5-th root = w^((p - 1)/5) - const DTH_ROOT: Self = Self::new(1041288259238279555); - - // Generator of the extension field - // Obtained by finding the smallest Hamming weight vector - // with appropriate order, starting at [0,1,0,0,0] - const EXT_GENERATOR: [Self; 5] = [Self::TWO, Self::ONE, Self::ZERO, Self::ZERO, Self::ZERO]; -} - -impl HasTwoAdicBinomialExtension<5> for Goldilocks { - const EXT_TWO_ADICITY: usize = 32; - - fn ext_two_adic_generator(bits: usize) -> [Self; 5] { - assert!(bits <= 32); - - field_to_array(Self::two_adic_generator(bits)) - } -} - -#[cfg(test)] -mod test_quadratic_extension { - - use num_bigint::BigUint; - use p3_field::extension::BinomialExtensionField; - use p3_field::{ExtensionField, PrimeCharacteristicRing}; - use p3_field_testing::{ - test_extension_field, test_field, test_packed_extension_field, - test_two_adic_extension_field, - }; - - use crate::Goldilocks; - - type F = Goldilocks; - type EF = BinomialExtensionField; - - // There is a redundant representation of zero but we already tested it - // when testing the base field. - const ZEROS: [EF; 1] = [EF::ZERO]; - const ONES: [EF; 1] = [EF::ONE]; - - // Get the prime factorization of the order of the multiplicative group. - // i.e. the prime factorization of P^2 - 1. 
- fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 9] { - [ - (BigUint::from(2u8), 33), - (BigUint::from(3u8), 1), - (BigUint::from(5u8), 1), - (BigUint::from(7u8), 1), - (BigUint::from(17u8), 1), - (BigUint::from(179u8), 1), - (BigUint::from(257u16), 1), - (BigUint::from(65537u32), 1), - (BigUint::from(7361031152998637u64), 1), - ] - } - - test_field!( - super::EF, - &super::ZEROS, - &super::ONES, - &super::multiplicative_group_prime_factorization() - ); - - test_extension_field!(super::F, super::EF); - test_two_adic_extension_field!(super::F, super::EF); - - type Pef = >::ExtensionPacking; - const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO]; - const PACKED_ONES: [Pef; 1] = [Pef::ONE]; - test_packed_extension_field!( - super::F, - super::EF, - super::Pef, - &super::PACKED_ZEROS, - &super::PACKED_ONES - ); -} - -#[cfg(test)] -mod test_quintic_extension { - - use num_bigint::BigUint; - use p3_field::extension::BinomialExtensionField; - use p3_field::{ExtensionField, PrimeCharacteristicRing}; - use p3_field_testing::{ - test_extension_field, test_field, test_packed_extension_field, - test_two_adic_extension_field, - }; - - use crate::Goldilocks; - - type F = Goldilocks; - type EF = BinomialExtensionField; - - // There is a redundant representation of zero but we already tested it - // when testing the base field. - const ZEROS: [EF; 1] = [EF::ZERO]; - const ONES: [EF; 1] = [EF::ONE]; - - // Get the prime factorization of the order of the multiplicative group. - // i.e. the prime factorization of P^5 - 1. 
- fn multiplicative_group_prime_factorization() -> [(num_bigint::BigUint, u32); 10] { - [ - (BigUint::from(2u8), 32), - (BigUint::from(3u8), 1), - (BigUint::from(5u8), 2), - (BigUint::from(17u8), 1), - (BigUint::from(257u16), 1), - (BigUint::from(45971u16), 1), - (BigUint::from(65537u32), 1), - (BigUint::from(255006435240067831u64), 1), - (BigUint::from(280083648770327405561u128), 1), - (BigUint::from(7053197395277272939628824863222181u128), 1), - ] - } - - test_field!( - super::EF, - &super::ZEROS, - &super::ONES, - &super::multiplicative_group_prime_factorization() - ); - - test_extension_field!(super::F, super::EF); - test_two_adic_extension_field!(super::F, super::EF); - - type Pef = >::ExtensionPacking; - const PACKED_ZEROS: [Pef; 1] = [Pef::ZERO]; - const PACKED_ONES: [Pef; 1] = [Pef::ONE]; - test_packed_extension_field!( - super::F, - super::EF, - super::Pef, - &super::PACKED_ZEROS, - &super::PACKED_ONES - ); -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs deleted file mode 100644 index ebe3f8c7a..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/goldilocks.rs +++ /dev/null @@ -1,813 +0,0 @@ -use alloc::vec; -use alloc::vec::Vec; -use core::fmt::{Debug, Display, Formatter}; -use core::hash::{Hash, Hasher}; -use core::iter::{Product, Sum}; -use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; -use core::{array, fmt}; - -use num_bigint::BigUint; -use p3_challenger::UniformSamplingField; -use p3_field::exponentiation::exp_10540996611094048183; -use p3_field::integers::QuotientMap; -use p3_field::op_assign_macros::{ - impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign, -}; -use p3_field::{ - Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField, - PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64, - quotient_map_large_iint, 
quotient_map_large_uint, quotient_map_small_int, -}; -use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner}; -use rand::Rng; -use rand::distr::{Distribution, StandardUniform}; -use serde::{Deserialize, Serialize}; - -/// The Goldilocks prime -pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001; - -/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`. -/// -/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64. -#[derive(Copy, Clone, Default, Serialize, Deserialize)] -#[repr(transparent)] // Important for reasoning about memory layout -#[must_use] -pub struct Goldilocks { - /// Not necessarily canonical. - pub(crate) value: u64, -} - -impl Goldilocks { - /// Create a new field element from any `u64`. - /// - /// Any `u64` value is accepted. No reduction is performed since - /// Goldilocks uses a non-canonical internal representation. - #[inline] - pub const fn new(value: u64) -> Self { - Self { value } - } - - /// Convert a `[u64; N]` array to an array of field elements. - /// - /// Const version of `input.map(Goldilocks::new)`. - #[inline] - pub const fn new_array(input: [u64; N]) -> [Self; N] { - let mut output = [Self::ZERO; N]; - let mut i = 0; - while i < N { - output[i].value = input[i]; - i += 1; - } - output - } - - /// Convert a `[[u64; N]; M]` array to a 2D array of field elements. - /// - /// Const version of `input.map(Goldilocks::new_array)`. - #[inline] - pub const fn new_2d_array( - input: [[u64; N]; M], - ) -> [[Self; N]; M] { - let mut output = [[Self::ZERO; N]; M]; - let mut i = 0; - while i < M { - output[i] = Self::new_array(input[i]); - i += 1; - } - output - } - - /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`. - const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg(); - - /// A list of generators for the two-adic subgroups of the goldilocks field. 
- /// - /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`. - pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([ - 0x0000000000000001, - 0xffffffff00000000, - 0x0001000000000000, - 0xfffffffeff000001, - 0xefffffff00000001, - 0x00003fffffffc000, - 0x0000008000000000, - 0xf80007ff08000001, - 0xbf79143ce60ca966, - 0x1905d02a5c411f4e, - 0x9d8f2ad78bfed972, - 0x0653b4801da1c8cf, - 0xf2c35199959dfcb6, - 0x1544ef2335d17997, - 0xe0ee099310bba1e2, - 0xf6b2cffe2306baac, - 0x54df9630bf79450e, - 0xabd0a6e8aa3d8a0e, - 0x81281a7b05f9beac, - 0xfbd41c6b8caa3302, - 0x30ba2ecd5e93e76d, - 0xf502aef532322654, - 0x4b2a18ade67246b5, - 0xea9d5a1336fbc98b, - 0x86cdcc31c307e171, - 0x4bbaf5976ecfefd8, - 0xed41d05b78d6e286, - 0x10d78dd8915a171d, - 0x59049500004a4485, - 0xdfa8c93ba46d2666, - 0x7e9bd009b86a0845, - 0x400a7f755588e659, - 0x185629dcda58878c, - ]); - - /// A list of powers of two from 0 to 95. - /// - /// Note that 2^{96} = -1 mod P so all powers of two can be simply - /// derived from this list. 
- const POWERS_OF_TWO: [Self; 96] = { - let mut powers_of_two = [Self::ONE; 96]; - - let mut i = 1; - while i < 64 { - powers_of_two[i] = Self::new(1 << i); - i += 1; - } - let mut var = Self::new(1 << 63); - while i < 96 { - var = const_add(var, var); - powers_of_two[i] = var; - i += 1; - } - powers_of_two - }; -} - -impl PartialEq for Goldilocks { - fn eq(&self, other: &Self) -> bool { - self.as_canonical_u64() == other.as_canonical_u64() - } -} - -impl Eq for Goldilocks {} - -impl Packable for Goldilocks {} - -impl Hash for Goldilocks { - fn hash(&self, state: &mut H) { - state.write_u64(self.as_canonical_u64()); - } -} - -impl Ord for Goldilocks { - fn cmp(&self, other: &Self) -> core::cmp::Ordering { - self.as_canonical_u64().cmp(&other.as_canonical_u64()) - } -} - -impl PartialOrd for Goldilocks { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Display for Goldilocks { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - Display::fmt(&self.as_canonical_u64(), f) - } -} - -impl Debug for Goldilocks { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - Debug::fmt(&self.as_canonical_u64(), f) - } -} - -impl Distribution for StandardUniform { - fn sample(&self, rng: &mut R) -> Goldilocks { - loop { - let next_u64 = rng.next_u64(); - let is_canonical = next_u64 < Goldilocks::ORDER_U64; - if is_canonical { - return Goldilocks::new(next_u64); - } - } - } -} - -impl UniformSamplingField for Goldilocks { - const MAX_SINGLE_SAMPLE_BITS: usize = 24; - const SAMPLING_BITS_M: [u64; 64] = { - let prime: u64 = P; - let mut a = [0u64; 64]; - let mut k = 0; - while k < 64 { - if k == 0 { - a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always. 
- } else { - // Create a mask to zero out the last k bits - let mask = !((1u64 << k) - 1); - a[k] = prime & mask; - } - k += 1; - } - a - }; -} - -impl PrimeCharacteristicRing for Goldilocks { - type PrimeSubfield = Self; - - const ZERO: Self = Self::new(0); - const ONE: Self = Self::new(1); - const TWO: Self = Self::new(2); - const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1); - - #[inline] - fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { - f - } - - #[inline] - fn from_bool(b: bool) -> Self { - Self::new(b.into()) - } - - #[inline] - fn halve(&self) -> Self { - Self::new(halve_u64::

(self.value)) - } - - #[inline] - fn mul_2exp_u64(&self, exp: u64) -> Self { - // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P. - if exp < 96 { - *self * Self::POWERS_OF_TWO[exp as usize] - } else if exp < 192 { - -*self * Self::POWERS_OF_TWO[(exp - 96) as usize] - } else { - self.mul_2exp_u64(exp % 192) - } - } - - #[inline] - fn div_2exp_u64(&self, mut exp: u64) -> Self { - // In the goldilocks field, 2^192 = 1 mod P. - // Thus 2^{-n} = 2^{192 - n} mod P. - exp %= 192; - self.mul_2exp_u64(192 - exp) - } - - #[inline] - fn sum_array(input: &[Self]) -> Self { - assert_eq!(N, input.len()); - // Benchmarking shows that for N <= 3 it's faster to sum the elements directly - // but for N > 3 it's faster to use the .sum() methods which passes through u128's - // allowing for delayed reductions. - match N { - 0 => Self::ZERO, - 1 => input[0], - 2 => input[0] + input[1], - 3 => input[0] + input[1] + input[2], - _ => input.iter().copied().sum(), - } - } - - #[inline] - fn dot_product(lhs: &[Self; N], rhs: &[Self; N]) -> Self { - // The constant OFFSET has 2 important properties: - // 1. It is a multiple of P. - // 2. It is greater than the maximum possible value of the sum of the products of two u64s. - const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32); - assert!((N as u32) <= (1 << 31)); - match N { - 0 => Self::ZERO, - 1 => lhs[0] * rhs[0], - 2 => { - // We unroll the N = 2 case as it is slightly faster and this is an important case - // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension. - let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128); - let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128); - - // We know that long_prod_0, long_prod_1 < OFFSET. - // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET. - let (sum, over) = long_prod_0.overflowing_add(long_prod_1); - // Compiler really likes defining sum_corr here instead of in the if/else. 
- let sum_corr = sum.wrapping_sub(OFFSET); - if over { - reduce128(sum_corr) - } else { - reduce128(sum) - } - } - _ => { - let (lo_plus_hi, hi) = lhs - .iter() - .zip(rhs) - .map(|(x, y)| (x.value as u128) * (y.value as u128)) - .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| { - // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits. - let val_hi = (val >> 96) as u64; - // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi. - // As N <= 2^32, acc_hi cannot overflow. - unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) } - }); - // First, remove the hi part from lo_plus_hi. - let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96); - // As 2^{96} = -1 mod P, we simply need to reduce lo - hi. - // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow. - let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) }; - reduce128(sum) - } - } - } - - #[inline] - fn zero_vec(len: usize) -> Vec { - // SAFETY: - // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment - // and memory layout making `flatten_to_base` safe. This this will create - // a vector Goldilocks elements with value set to 0. - unsafe { flatten_to_base(vec![0u64; len]) } - } -} - -/// Degree of the smallest permutation polynomial for Goldilocks. -/// -/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7. -impl InjectiveMonomial<7> for Goldilocks {} - -impl PermutationMonomial<7> for Goldilocks { - /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}. - /// - /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`. 
- fn injective_exp_root_n(&self) -> Self { - exp_10540996611094048183(*self) - } -} - -impl RawDataSerializable for Goldilocks { - impl_raw_serializable_primefield64!(); -} - -impl Field for Goldilocks { - #[cfg(all( - target_arch = "x86_64", - target_feature = "avx2", - not(target_feature = "avx512f") - ))] - type Packing = crate::PackedGoldilocksAVX2; - - #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] - type Packing = crate::PackedGoldilocksAVX512; - - // PATCHED for bench_vs_plonky3: disable NEON packing for apples-to-apples - // scalar comparison against Lambda STARK. Upstream: `crate::PackedGoldilocksNeon`. - #[cfg(target_arch = "aarch64")] - type Packing = Self; - - #[cfg(not(any( - all( - target_arch = "x86_64", - target_feature = "avx2", - not(target_feature = "avx512f") - ), - all(target_arch = "x86_64", target_feature = "avx512f"), - target_arch = "aarch64", - )))] - type Packing = Self; - - // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator() - const GENERATOR: Self = Self::new(7); - - fn is_zero(&self) -> bool { - self.value == 0 || self.value == Self::ORDER_U64 - } - - fn try_inverse(&self) -> Option { - if self.is_zero() { - return None; - } - - Some(gcd_inversion(*self)) - } - - #[inline] - fn order() -> BigUint { - P.into() - } -} - -// We use macros to implement QuotientMap for all integer types except for u64 and i64. -quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]); -quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]); -quotient_map_large_uint!( - Goldilocks, - u64, - Goldilocks::ORDER_U64, - "`[0, 2^64 - 2^32]`", - "`[0, 2^64 - 1]`", - [u128] -); -quotient_map_large_iint!( - Goldilocks, - i64, - "`[-(2^63 - 2^31), 2^63 - 2^31]`", - "`[1 + 2^32 - 2^64, 2^64 - 1]`", - [(i128, u128)] -); - -impl QuotientMap for Goldilocks { - /// Convert a given `u64` integer into an element of the `Goldilocks` field. - /// - /// No reduction is needed as the internal value is allowed - /// to be any u64. 
- #[inline] - fn from_int(int: u64) -> Self { - Self::new(int) - } - - /// Convert a given `u64` integer into an element of the `Goldilocks` field. - /// - /// Return `None` if the given integer is greater than `p = 2^64 - 2^32 + 1`. - #[inline] - fn from_canonical_checked(int: u64) -> Option { - (int < Self::ORDER_U64).then(|| Self::new(int)) - } - - /// Convert a given `u64` integer into an element of the `Goldilocks` field. - /// - /// # Safety - /// In this case this function is actually always safe as the internal - /// value is allowed to be any u64. - #[inline(always)] - unsafe fn from_canonical_unchecked(int: u64) -> Self { - Self::new(int) - } -} - -impl QuotientMap for Goldilocks { - /// Convert a given `i64` integer into an element of the `Goldilocks` field. - /// - /// We simply need to deal with the sign. - #[inline] - fn from_int(int: i64) -> Self { - if int >= 0 { - Self::new(int as u64) - } else { - Self::new(Self::ORDER_U64.wrapping_add_signed(int)) - } - } - - /// Convert a given `i64` integer into an element of the `Goldilocks` field. - /// - /// Returns none if the input does not lie in the range `(-(2^63 - 2^31), 2^63 - 2^31)`. - #[inline] - fn from_canonical_checked(int: i64) -> Option { - const POS_BOUND: i64 = (P >> 1) as i64; - const NEG_BOUND: i64 = -POS_BOUND; - match int { - 0..=POS_BOUND => Some(Self::new(int as u64)), - NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))), - _ => None, - } - } - - /// Convert a given `i64` integer into an element of the `Goldilocks` field. - /// - /// # Safety - /// In this case this function is actually always safe as the internal - /// value is allowed to be any u64. 
- #[inline(always)] - unsafe fn from_canonical_unchecked(int: i64) -> Self { - Self::from_int(int) - } -} - -impl PrimeField for Goldilocks { - fn as_canonical_biguint(&self) -> BigUint { - self.as_canonical_u64().into() - } -} - -impl PrimeField64 for Goldilocks { - const ORDER_U64: u64 = P; - - #[inline] - fn as_canonical_u64(&self) -> u64 { - let mut c = self.value; - // We only need one condition subtraction, since 2 * ORDER would not fit in a u64. - if c >= Self::ORDER_U64 { - c -= Self::ORDER_U64; - } - c - } -} - -impl TwoAdicField for Goldilocks { - const TWO_ADICITY: usize = 32; - - fn two_adic_generator(bits: usize) -> Self { - assert!(bits <= Self::TWO_ADICITY); - Self::TWO_ADIC_GENERATORS[bits] - } -} - -/// A const version of the addition function. -/// -/// Useful for constructing constants values in const contexts. Outside of -/// const contexts, Add should be used instead. -#[inline] -const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks { - let (sum, over) = lhs.value.overflowing_add(rhs.value); - let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER); - if over { - sum += Goldilocks::NEG_ORDER; - } - Goldilocks::new(sum) -} - -impl Add for Goldilocks { - type Output = Self; - - #[inline] - fn add(self, rhs: Self) -> Self { - let (sum, over) = self.value.overflowing_add(rhs.value); - let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER); - if over { - // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not - // sufficient for double-overflow. - // This assume does two things: - // 1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip - // this check. - // 2. Hints to the compiler how rare this double-overflow is (thus handled better with - // a branch). - unsafe { - assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64); - } - branch_hint(); - sum += Self::NEG_ORDER; // Cannot overflow. 
- } - Self::new(sum) - } -} - -impl Sub for Goldilocks { - type Output = Self; - - #[inline] - fn sub(self, rhs: Self) -> Self { - let (diff, under) = self.value.overflowing_sub(rhs.value); - let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER); - if under { - // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not - // sufficient for double-underflow. - // This assume does two things: - // 1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER, - // then it can skip this check. - // 2. Hints to the compiler how rare this double-underflow is (thus handled better - // with a branch). - unsafe { - assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64); - } - branch_hint(); - diff -= Self::NEG_ORDER; // Cannot underflow. - } - Self::new(diff) - } -} - -impl Neg for Goldilocks { - type Output = Self; - - #[inline] - fn neg(self) -> Self::Output { - Self::new(Self::ORDER_U64 - self.as_canonical_u64()) - } -} - -impl Mul for Goldilocks { - type Output = Self; - - #[inline] - fn mul(self, rhs: Self) -> Self { - reduce128(u128::from(self.value) * u128::from(rhs.value)) - } -} - -impl_add_assign!(Goldilocks); -impl_sub_assign!(Goldilocks); -impl_mul_methods!(Goldilocks); -impl_div_methods!(Goldilocks, Goldilocks); - -impl Sum for Goldilocks { - fn sum>(iter: I) -> Self { - // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2. - - // This sum will not overflow so long as iter.len() < 2^64. - let sum = iter.map(|x| x.value as u128).sum::(); - reduce128(sum) - } -} - -/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the -/// field order and `2^64`. 
-#[inline] -pub(crate) fn reduce128(x: u128) -> Goldilocks { - let (x_lo, x_hi) = split(x); // This is a no-op - let x_hi_hi = x_hi >> 32; - let x_hi_lo = x_hi & Goldilocks::NEG_ORDER; - - let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi); - if borrow { - branch_hint(); // A borrow is exceedingly rare. It is faster to branch. - t0 -= Goldilocks::NEG_ORDER; // Cannot underflow. - } - let t1 = x_hi_lo * Goldilocks::NEG_ORDER; - let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) }; - Goldilocks::new(t2) -} - -#[inline] -#[allow(clippy::cast_possible_truncation)] -const fn split(x: u128) -> (u64, u64) { - (x as u64, (x >> 64) as u64) -} - -/// Fast addition modulo ORDER for x86-64. -/// This function is marked unsafe for the following reasons: -/// - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001. -/// - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in -/// the registers, so its use is not recommended when either input will be used again. -#[inline(always)] -#[cfg(target_arch = "x86_64")] -unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { - unsafe { - let res_wrapped: u64; - let adjustment: u64; - core::arch::asm!( - "add {0}, {1}", - // Trick. The carry flag is set iff the addition overflowed. - // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does - // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of - // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1} - // on overflow; this happens be NEG_ORDER. - // Note that the CPU does not realize that the result of sbb x, x does not actually depend - // on x. We must write the result to a register that we know to be ready. We have a - // dependency on {1} anyway, so let's use it. 
- "sbb {1:e}, {1:e}", - inlateout(reg) x => res_wrapped, - inlateout(reg) y => adjustment, - options(pure, nomem, nostack), - ); - assume(x != 0 || (res_wrapped == y && adjustment == 0)); - assume(y != 0 || (res_wrapped == x && adjustment == 0)); - // Add NEG_ORDER == subtract ORDER. - // Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect. - res_wrapped + adjustment - } -} - -#[inline(always)] -#[cfg(not(target_arch = "x86_64"))] -unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 { - let (res_wrapped, carry) = x.overflowing_add(y); - // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect. - res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry) -} - -/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm. -/// -/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf. -/// The key idea is to compute update factors which are incorrect by a known power of 2 which -/// can be corrected at the end. These update factors can then be used to construct the inverse -/// via a simple linear combination. -/// -/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic. -fn gcd_inversion(input: Goldilocks) -> Goldilocks { - // Initialise our values to the value we want to invert and the prime. - let (mut a, mut b) = (input.value, P); - - // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`. - // This means we will need `126` iterations of the inner loop ensure `len(a) + len(b) ≤ 2`. - // We split the iterations into 2 rounds of length 63. - const ROUND_SIZE: usize = 63; - - // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted - // version which doesn't do any computations involving g. 
But either the compiler works this out - // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks. - let (f00, _, f10, _) = gcd_inner::(&mut a, &mut b); - let (_, _, f11, g11) = gcd_inner::(&mut a, &mut b); - - // The update factors are i64's except we need to interpret -2^63 as 2^63. - // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`. - let u = from_unusual_int(f00); - let v = from_unusual_int(f10); - let u_fac11 = from_unusual_int(f11); - let v_fac11 = from_unusual_int(g11); - - // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}. - // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66. - (u * u_fac11 + v * v_fac11).mul_2exp_u64(66) -} - -/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63. -const fn from_unusual_int(int: i64) -> Goldilocks { - if (int >= 0) || (int == i64::MIN) { - Goldilocks::new(int as u64) - } else { - Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int)) - } -} - -#[cfg(test)] -mod tests { - use p3_field::extension::BinomialExtensionField; - use p3_field_testing::{ - test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field, - }; - - use super::*; - - type F = Goldilocks; - type EF = BinomialExtensionField; - - #[test] - fn test_goldilocks() { - let f = F::new(100); - assert_eq!(f.as_canonical_u64(), 100); - - // Over the Goldilocks field, the following set of equations hold - // p = 0 - // 2^64 - 2^32 + 1 = 0 - // 2^64 = 2^32 - 1 - let f = F::new(u64::MAX); - assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1); - - let f = F::from_u64(u64::MAX); - assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1); - - // Generator check - let expected_multiplicative_group_generator = F::new(7); - assert_eq!(F::GENERATOR, expected_multiplicative_group_generator); - assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64); - - // Check on `reduce_u128` - 
let x = u128::MAX; - let y = reduce128(x); - // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1 - // 2^128 - 1 = (2^64 - 1) * (2^64 + 1) - // = (2^32 - 1 - 1) * (2^32 - 1 + 1) - // = (2^32 - 2) * (2^32) - // = 2^64 - 2 * 2^32 - // = 2^64 - 2^33 - // = 2^32 - 1 - 2^33 - // = - 2^32 - 1 - let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE; - assert_eq!(y, expected_result); - - let f = F::new(100); - assert_eq!(f.injective_exp_n().injective_exp_root_n(), f); - assert_eq!(y.injective_exp_n().injective_exp_root_n(), y); - assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO); - } - - // Goldilocks has a redundant representation for both 0 and 1. - const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)]; - const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)]; - - // Get the prime factorization of the order of the multiplicative group. - // i.e. the prime factorization of P - 1. - fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] { - [ - (BigUint::from(2u8), 32), - (BigUint::from(3u8), 1), - (BigUint::from(5u8), 1), - (BigUint::from(17u8), 1), - (BigUint::from(257u16), 1), - (BigUint::from(65537u32), 1), - ] - } - - test_field!( - crate::Goldilocks, - &super::ZEROS, - &super::ONES, - &super::multiplicative_group_prime_factorization() - ); - test_prime_field!(crate::Goldilocks); - test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES); - test_two_adic_field!(crate::Goldilocks); - - test_field_dft!( - radix2dit, - crate::Goldilocks, - super::EF, - p3_dft::Radix2Dit<_> - ); - test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers); - test_field_dft!( - parallel, - crate::Goldilocks, - super::EF, - p3_dft::Radix2DitParallel - ); -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs deleted file mode 100644 index 9447fe094..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/lib.rs +++ 
/dev/null @@ -1,42 +0,0 @@ -//! The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`. - -#![no_std] - -extern crate alloc; - -mod extension; -mod goldilocks; -mod mds; -mod poseidon2; - -pub use goldilocks::*; -pub use mds::*; -pub use poseidon2::*; - -pub mod poseidon1; - -#[cfg(target_arch = "aarch64")] -mod aarch64_neon; - -#[cfg(target_arch = "aarch64")] -pub use aarch64_neon::*; - -#[cfg(all( - target_arch = "x86_64", - target_feature = "avx2", - not(target_feature = "avx512f") -))] -mod x86_64_avx2; - -#[cfg(all( - target_arch = "x86_64", - target_feature = "avx2", - not(target_feature = "avx512f") -))] -pub use x86_64_avx2::*; - -#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] -mod x86_64_avx512; - -#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] -pub use x86_64_avx512::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs deleted file mode 100644 index df41485b3..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/mds.rs +++ /dev/null @@ -1,761 +0,0 @@ -//! MDS matrices over the Goldilocks field, and permutations defined by them. -//! -//! NB: Not all sizes have fast implementations of their permutations. -//! Supported sizes: 8, 12, 16, 24, 32, 64, 68. -//! Sizes 8 and 12 are from Plonky2, size 16 was found as part of concurrent -//! work by Angus Gruen and Hamish Ivey-Law. Other sizes are from Ulrich Haböck's -//! database. - -use p3_dft::Radix2Bowers; -use p3_mds::MdsPermutation; -use p3_mds::karatsuba_convolution::Convolve; -use p3_mds::util::{apply_circulant, apply_circulant_fft, first_row_to_first_col}; -use p3_symmetric::Permutation; - -use crate::{Goldilocks, reduce128}; - -#[derive(Clone, Debug, Default)] -pub struct MdsMatrixGoldilocks; - -/// Instantiate convolution for "small" RHS vectors over Goldilocks. 
-/// -/// Here "small" means N = len(rhs) <= 16 and sum(r for r in rhs) < -/// 2^51, though in practice the sum will be less than 2^9. -#[derive(Debug)] -pub struct SmallConvolveGoldilocks; -impl Convolve for SmallConvolveGoldilocks { - const T_ZERO: i128 = 0; - const U_ZERO: i64 = 0; - - #[inline(always)] - fn halve(val: i128) -> i128 { - val >> 1 - } - - /// Return the lift of a Goldilocks element, 0 <= input.value <= P - /// < 2^64. We widen immediately, since some valid Goldilocks elements - /// don't fit in an i64, and since in any case overflow can occur - /// for even the smallest convolutions. - #[inline(always)] - fn read(input: Goldilocks) -> i128 { - input.value as i128 - } - - /// For a convolution of size N, |x| < N * 2^64 and (as per the - /// assumption above), |y| < 2^51. So the product is at most N * - /// 2^115 which will not overflow for N <= 16. We widen `y` at - /// this point to perform the multiplication. - #[inline(always)] - fn parity_dot(u: [i128; N], v: [i64; N]) -> i128 { - let mut s = 0i128; - for i in 0..N { - s += u[i] * v[i] as i128; - } - s - } - - /// The assumptions above mean z < N^2 * 2^115, which is at most - /// 2^123 when N <= 16. - /// - /// NB: Even though intermediate values could be negative, the - /// output must be non-negative since the inputs were - /// non-negative. - #[inline(always)] - fn reduce(z: i128) -> Goldilocks { - debug_assert!(z >= 0); - reduce128(z as u128) - } -} - -const FFT_ALGO: Radix2Bowers = Radix2Bowers; - -pub(crate) const MATRIX_CIRC_MDS_8_SML_ROW: [i64; 8] = [7, 1, 3, 8, 8, 3, 4, 9]; - -/// First column of the circulant MDS matrix for width 8, derived from the first row. 
-pub const MATRIX_CIRC_MDS_8_COL: [i64; 8] = first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW); - -impl Permutation<[Goldilocks; 8]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 8]) -> [Goldilocks; 8] { - const MATRIX_CIRC_MDS_8_SML_COL: [i64; 8] = - first_row_to_first_col(&MATRIX_CIRC_MDS_8_SML_ROW); - SmallConvolveGoldilocks::apply( - input, - MATRIX_CIRC_MDS_8_SML_COL, - SmallConvolveGoldilocks::conv8, - ) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -pub(crate) const MATRIX_CIRC_MDS_12_SML_ROW: [i64; 12] = [1, 1, 2, 1, 8, 9, 10, 7, 5, 9, 4, 10]; - -/// First column of the circulant MDS matrix for width 12, derived from the first row. -pub const MATRIX_CIRC_MDS_12_COL: [i64; 12] = first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW); - -impl Permutation<[Goldilocks; 12]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 12]) -> [Goldilocks; 12] { - const MATRIX_CIRC_MDS_12_SML_COL: [i64; 12] = - first_row_to_first_col(&MATRIX_CIRC_MDS_12_SML_ROW); - SmallConvolveGoldilocks::apply( - input, - MATRIX_CIRC_MDS_12_SML_COL, - SmallConvolveGoldilocks::conv12, - ) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -pub(crate) const MATRIX_CIRC_MDS_16_SML_ROW: [i64; 16] = - [1, 1, 51, 1, 11, 17, 2, 1, 101, 63, 15, 2, 67, 22, 13, 3]; - -impl Permutation<[Goldilocks; 16]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 16]) -> [Goldilocks; 16] { - const MATRIX_CIRC_MDS_16_SML_COL: [i64; 16] = - first_row_to_first_col(&MATRIX_CIRC_MDS_16_SML_ROW); - SmallConvolveGoldilocks::apply( - input, - MATRIX_CIRC_MDS_16_SML_COL, - SmallConvolveGoldilocks::conv16, - ) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[rustfmt::skip] -pub(crate) const MATRIX_CIRC_MDS_24_GOLDILOCKS: [u64; 24] = [ - 0x5FFFFFFFA00AAAAB, 0x24021AB75BBFE656, 0x7BE9082D73B06DF5, 0x2282863E9C3A5A62, - 0xE0071C70DFFC71C8, 0x796CB65AB42A1A63, 0xDBBBBFFADFFDDDE3, 0x23B88EE217C5C9C2, - 0x20030C309FFB6DB7, 0x23C3C64763BE1E1D, 
0x0F93B7C9CC51362E, 0xC697A1094BD0850A, - 0xDFFFFFFF1FFC71C8, 0xC15A4FD614950302, 0xC41D883A4C4DEDF2, 0x187879BC23C46462, - 0x5FFCF3CEDFFE79E8, 0x1C41DF105B82398E, 0x64444003DFFDDDDA, 0x76EDDBB6F7E51F95, - 0x1FF8E38E20038E39, 0x214139BD5C40A09D, 0x3065B7CCF3B3B621, 0x23B6F4622485CEDC, -]; - -impl Permutation<[Goldilocks; 24]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 24]) -> [Goldilocks; 24] { - apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[rustfmt::skip] -const MATRIX_CIRC_MDS_32_GOLDILOCKS: [u64; 32] = [ - 0x0800000000000000, 0x69249248B4924925, 0x3ABD5EAF15EAF57B, 0x294A5294739CE73A, - 0x59E2D2CEB4B3C5A6, 0x087FBE00FF7C0220, 0xA554AA94A554AA96, 0xF00080FEFFDF8005, - 0x64CCCCCC6666699A, 0x5B13AD8973B139D9, 0xAD4A55ACA54AD5AA, 0xDA496DA3B492DB8A, - 0x4AD696955A5694B5, 0xA4A6B29A25B496D3, 0xA74EA162162BD3A9, 0xC698B3A5662CE98C, - 0xA7FFFFFF55555556, 0x4AAAAAAA5AAAAAAB, 0xB047DC113DC11F71, 0x8BA2E8B99B26C9B3, - 0xD259696C5A5B4D2E, 0xA7D540AA557EA9F6, 0x8B6E922D26DB249C, 0xFAAA805455602AAD, - 0xCB33333266666334, 0xD13B17619B13B277, 0x45B26D9326E9374A, 0x52AB552A5AA9556B, - 0x68ED2D2DB4B87697, 0x8B264C98A74E9D3B, 0x09EC23D83D847B09, 0x2C9A4D26669349A5, -]; - -impl Permutation<[Goldilocks; 32]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 32]) -> [Goldilocks; 32] { - const ENTRIES: [u64; 32] = first_row_to_first_col(&MATRIX_CIRC_MDS_32_GOLDILOCKS); - apply_circulant_fft(&FFT_ALGO, ENTRIES, &input) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[rustfmt::skip] -const MATRIX_CIRC_MDS_64_GOLDILOCKS: [u64; 64] = [ - 0x07FFFFFFFC000000, 0xFBFFFFFF04000001, 0x436DB6DB25B6DB6E, 0x4AAAAAAA5AAAAAAB, - 0x45B2D96C6D96CB66, 0x3BC7BC7B87BC7BC8, 0x6318C63125294A53, 0xCB3672CCCD9CB368, - 0xB43CB5A12D68796C, 0xFBFBFBFAFBFBFBFD, 0x883DBF107B7E2210, 0x8A7689B59B629DA3, - 0xF7FEFFDF00000001, 0x7B7C83BBC83BC47C, 0xEFF0410107EF7F83, 0x2CD8B3629CB272CA, - 
0x9800019900CCCE67, 0xFBFFFBFF07FFFC01, 0x94EC4A758C4EC628, 0xDA5A5B4A6D2D2E1F, - 0xFFEFC080FC003FFF, 0xBC387BC2C783BC79, 0xB492DB686D24B6F3, 0x1DB6925B4B6E2477, - 0x7801E0EF87BFFF10, 0xFC0803FAFBFC0409, 0x3780FE03C086F21C, 0x8B749B224DB22D94, - 0x32648B36B76E9923, 0x3BC3C3C387C3C3C4, 0x79AF286B4FCA1AF3, 0x9E2762758B627628, - 0x52AAAAAA56AAAAAB, 0xFBFFFFFEFC000001, 0xF7FFFFFF08000001, 0x2CCCCCCC9CCCCCCD, - 0xCF286BC946BCA1B0, 0xBC483B7B883B7C49, 0xD9364D9287C1F07D, 0xAD5A94A8A95AD5AA, - 0xFF871002C400F1E1, 0xFC03FC02FC03FC05, 0xD29495A4D6D4B4A6, 0x6C926DD1DD24DB65, - 0x1EDC247B4DB64937, 0x7C7B843B47BC437D, 0xA55A95AAAD5AD52C, 0x4A96D5A45AD694A6, - 0xFE6664CBCD999801, 0xFC0003FF08000401, 0x1EC4F09D64EC4D8A, 0x9E1E1D2C8B4B4A5B, - 0xD9270937709B64DC, 0x3BB77C4448843B78, 0xFFFFFFDF03FF0021, 0x59D8761D2D8A6299, - 0xC3496878A5E5A4B5, 0xFBF80402FC0403F9, 0x5ECD9B360E142851, 0x6D925D6429D64976, - 0xA8AE615C19CC2B99, 0xBC44444388444445, 0xDFE3F1F81CFC7E40, 0xDA4924916D24924A, -]; - -impl Permutation<[Goldilocks; 64]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 64]) -> [Goldilocks; 64] { - const ENTRIES: [u64; 64] = first_row_to_first_col(&MATRIX_CIRC_MDS_64_GOLDILOCKS); - apply_circulant_fft(&FFT_ALGO, ENTRIES, &input) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[rustfmt::skip] -const MATRIX_CIRC_MDS_68_GOLDILOCKS: [u64; 68] = [ - 0x03C3C3C3FC3C3C3C, 0x6799AFC54A69BC7D, 0xDA8C2C496A74B03B, 0x1E641D7AB35ED229, - 0x9239DA20DA3A2686, 0x6E23D41459EBA8C4, 0x7BC412896E2A6B3A, 0x9082059089ABD4FC, - 0x94A16FA8B0339EEE, 0x85650EC91BB519C9, 0x1600745267E94DE1, 0xFFFD8405C82020AB, - 0x21BDE80429DCED6A, 0x8ACE123AF754E343, 0xFFC7211605D2BDAE, 0xC21187AE15900F4D, - 0x9C4A889708568DC6, 0x65A5A726B5758D8E, 0x949DB90B9AC0D11A, 0x23B6CF7C368BBE52, - 0xD5128DDF59CB5A35, 0xF53BCC5BDADF3A0A, 0xBA7C5112F4BAB1CD, 0x4B93989C5B729351, - 0x6534B7E50E4AD1CB, 0x640061B54C918405, 0x0E66E1F90D2C9311, 0x31C8649B0FE7557F, - 0x0E9190D165F4A8F3, 0x52DF336BB708F919, 
0x3C0F6697F14065A5, 0xBE8190942EC50031, - 0x60038E9ACC701118, 0x73F105909A55A88B, 0xFEBEBEBDABEBEBED, 0x6F52163A64B03467, - 0xFBAE131F23A12F56, 0x1950493BC70D0676, 0x2886550DB5A1BBBF, 0x15B003D6E58181D7, - 0x3A4E7D9D44F100F8, 0x6CC3AB896025E6A0, 0x7E23E68456F825E5, 0x079CDD570B591A16, - 0xEC15A830C3D2CCD1, 0xCF4C722D2C0F8A0E, 0xC1BB6F5591B59A26, 0xB63A5931A607BDE0, - 0x43A0AD0B71040187, 0x7E4B492889D1CEE0, 0x734153F3F0C31C5B, 0x98D8D756B2725A5B, - 0x5589D20D74BA00B8, 0xB2DF58DF0A312509, 0xFABC378690D64A3A, 0x700640AFC244B695, - 0xFFA652236547F3BE, 0x2B9CA498A001D059, 0x7DACA6F16787D5DE, 0xAAAD774FAC613EA3, - 0xA88583816975CD56, 0x78B71DC516FF49CA, 0xC7BF095DF702FFA6, 0x78A60B3F971783B3, - 0xCB158EF40BC75CAC, 0xA97E818DBC152B4C, 0x9FC8339D415C3999, 0x006A88C0A0D8201C, -]; - -impl Permutation<[Goldilocks; 68]> for MdsMatrixGoldilocks { - fn permute(&self, input: [Goldilocks; 68]) -> [Goldilocks; 68] { - apply_circulant(&MATRIX_CIRC_MDS_68_GOLDILOCKS, &input) - } -} -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[cfg(test)] -mod tests { - use p3_symmetric::Permutation; - - use super::{Goldilocks, MdsMatrixGoldilocks}; - - #[test] - fn goldilocks8() { - let input: [Goldilocks; 8] = Goldilocks::new_array([ - 2434589605738284713, - 4817685620989478889, - 13397079175138649456, - 11944520631108649751, - 1033251468644039632, - 3092099742268329866, - 7160548811622790454, - 9959569614427134344, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 8] = Goldilocks::new_array([ - 16726687146516531007, - 14721040752765534861, - 15566838577475948790, - 9095485010737904250, - 11353934351835864222, - 11056556168691087893, - 4199602889124860181, - 315643510993921470, - ]); - - assert_eq!(output, expected); - } - - #[test] - fn goldilocks12() { - let input: [Goldilocks; 12] = Goldilocks::new_array([ - 14847187883725400244, - 969392934980971521, - 6996647758016470432, - 4674844440624672154, - 264841656685969785, - 1246852265697711623, - 
18223868478428473484, - 12122736699239070772, - 11263701854732819430, - 12739925508864285577, - 11648637570857932167, - 14090978315217600393, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 12] = Goldilocks::new_array([ - 9322351889214742299, - 8700136572060418355, - 4881757876459003977, - 9899544690241851021, - 480548822895830465, - 5445915149371405525, - 14955363277757168581, - 6672733082273363313, - 190938676320003294, - 1613225933948270736, - 3549006224849989171, - 12169032187873197425, - ]); - - assert_eq!(output, expected); - } - - #[test] - fn goldilocks16() { - let input: [Goldilocks; 16] = Goldilocks::new_array([ - 13216135600341032847, - 15626390207663319651, - 2052474569300149934, - 4375663431730581786, - 16596827905941257435, - 10019626608444427271, - 7831946179065963230, - 17104499871144693506, - 9021930732511690478, - 6899419210615882449, - 8131182521761419514, - 432489675596019804, - 8508050013409958723, - 14134506582804571789, - 13283546413390931641, - 14711125975653831032, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 16] = Goldilocks::new_array([ - 9484392671298797780, - 149770626972189150, - 12125722600598304117, - 15945232149672903756, - 13199929870021500593, - 18443980893262804946, - 317150800081307627, - 16910019239751125049, - 1996802739033818490, - 11668458913264624237, - 11078800762167869397, - 13758408662406282356, - 11119677412113674380, - 7344117715971661026, - 4202436890275702092, - 681166793519210465, - ]); - - assert_eq!(output, expected); - } - - #[test] - fn goldilocks24() { - let input: [Goldilocks; 24] = Goldilocks::new_array([ - 11426771245122339662, - 5975488243963332229, - 11441424994503305651, - 5755561333702259678, - 7295454168648181339, - 16724279929816174064, - 32359231037136391, - 3713621595270370753, - 8421765959140936778, - 12370571593326246544, - 8633733294559731287, - 12765436832373161027, - 15606692828890413034, - 
8068160018166226874, - 10719661629577139538, - 13036735610140127982, - 10213543772818211674, - 8041886705706266368, - 12022983417703446028, - 4179370708601587579, - 11125302089484330465, - 9904943018174649533, - 16178194376951442671, - 1545799842160818502, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 24] = Goldilocks::new_array([ - 18431075688485197060, - 14823984346528185622, - 7262979358411339215, - 14816911393874702213, - 6721523710303409972, - 10829861327716364029, - 2456948878733883601, - 11088379938350287658, - 3820735023521527858, - 9062288923770492958, - 5159244568306327366, - 1401669669887165869, - 11908734248351870182, - 10640195377186320543, - 6552733980894593378, - 17103376282032495459, - 5204287788603805758, - 17783185518697631139, - 9006863878586007300, - 11122535637762904803, - 5271621316102699962, - 9734499541452484536, - 11778274360927642637, - 3217831681350496533, - ]); - - assert_eq!(output, expected); - } - - #[test] - fn goldilocks32() { - let input: [Goldilocks; 32] = Goldilocks::new_array([ - 8401806579759049284, - 14709608922272986544, - 8130995604641968478, - 7833133203357642391, - 10700492548100684406, - 3941105252506602047, - 8122370916776133262, - 15079919378435648206, - 8774521769784086994, - 16794844316583392853, - 9356562741425567167, - 13317198313361936216, - 7187680218428599522, - 16525662096158660997, - 540453741156061014, - 16543585577270698663, - 3802215918136285729, - 11389297895303247764, - 5133769394766075512, - 1057795099426170863, - 18037861421172314665, - 17632255188776359310, - 17616515088477043142, - 13307921676744533876, - 17602277262015191215, - 15819040654617566738, - 11961318546000835928, - 15593174310433874065, - 9152657050882549004, - 4801868480369948110, - 13202076339494141066, - 726396847460932316, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 32] = Goldilocks::new_array([ - 1179701925859507209, - 5543239597787055637, - 
5978278622530964070, - 3622388166841103287, - 11383243182536830899, - 14719109850604985734, - 17672601866826623850, - 4879627080283827596, - 7556887460241466109, - 9548493506061808122, - 13980851986825291174, - 2029844508485082398, - 10375517623784134775, - 13067093881736606569, - 6446569064196467795, - 15375603814779462714, - 11307946648742033371, - 1593906954637160608, - 5776169226282316678, - 8167048017892669861, - 3954052226208277367, - 9346878497567392707, - 5570872870988220142, - 10792661164389799960, - 17494962593174487938, - 7080549557843445752, - 14059834522311268132, - 17747288366997773235, - 17158122400620315305, - 6816598002359267850, - 12363049840026116993, - 13313901185845854868, - ]); - - assert_eq!(output, expected); - } - - #[test] - fn goldilocks64() { - let input: [Goldilocks; 64] = Goldilocks::new_array([ - 3471075506106776899, - 4817046918282259009, - 3480368692354016145, - 18110937755057600106, - 3130862083451221140, - 15376650156021437015, - 7997596749112997445, - 7742916918728590149, - 421644639408377358, - 2491271421424548020, - 1940196613872160755, - 7152053147988203177, - 13697425352450853423, - 15877844788345672674, - 17787098720906653510, - 6857627524724866519, - 8541180216786820396, - 10769715704553877654, - 9265712399189924160, - 10220120296438955872, - 18201417281995610945, - 6749698931189855822, - 13700000989116811950, - 13205437213697578097, - 10514342943989454609, - 9926015350795325725, - 2289808224483690257, - 12598806357998460973, - 14393945610969324307, - 4744625557965362093, - 2270701163031951561, - 2927942398784334090, - 5250916386894733430, - 4030189910566345872, - 4953663590324639075, - 1241519685782896035, - 8681312160951359069, - 8236353015475387411, - 4972690458759871996, - 1396852754187463352, - 17512022752774329733, - 14009268822557836700, - 1346736409027879377, - 7609463340861239931, - 10701512803758419515, - 5067199073587389986, - 5030018986055211116, - 17692625804700013551, - 9992938630604785132, - 
15350127009762647067, - 10247405821493235386, - 15172888833500531069, - 14657693742399622179, - 7391511805216089127, - 2035742693690795598, - 4047216012963057952, - 12602085105939403203, - 16985723692990258059, - 12141021186082151434, - 3174646196626212833, - 16484520987666295947, - 10579720164460442970, - 9596917135039689219, - 13761818390665814258, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 64] = Goldilocks::new_array([ - 9158798369861934356, - 9224859686427886689, - 16948559910286211274, - 15765762765140902574, - 16202509467561200764, - 1911749439284071529, - 4607026757869726805, - 8473827004973131317, - 13716800466551879373, - 6670177022201597800, - 17416833238376299449, - 14953676562252669578, - 5828107070718286209, - 17980287408679531241, - 2220583438808757820, - 14564318040622847100, - 3950519594558514416, - 12164610170526828198, - 457385640833960098, - 14068973922383216628, - 9614382247226943793, - 3932756878771319222, - 12728498054939249570, - 9435109056498897661, - 7283114805836756402, - 1720178259138435097, - 11496602000538177285, - 7736206812858942065, - 14289784438950643645, - 12052665489155550962, - 12918409840610303255, - 5224324424989208352, - 7826309014606327907, - 11657314889847733528, - 13899641072303006348, - 7501780959676548477, - 1064261716045449147, - 1487682458939665452, - 10894217148983862136, - 12785338167343566981, - 8043323074629160032, - 10852328074701301213, - 15029722608724150267, - 2611937278660861263, - 13995790409949796943, - 7103138700054564899, - 12756778219044204581, - 4147399997707606088, - 11930966590061754579, - 16708700985380478903, - 2370160521342035603, - 14893791582608133454, - 15313288276425450946, - 16224601303711716386, - 4488931442519177087, - 7443169181907410918, - 12381442753785370161, - 16366345507676500076, - 8097905256807642731, - 8504207502183388457, - 11400931328719780407, - 10879211614969476303, - 7265889003783205111, - 7322738272300165489, - ]); - - 
assert_eq!(output, expected); - } - - #[test] - fn goldilocks68() { - let input: [Goldilocks; 68] = Goldilocks::new_array([ - 16450563043143968653, - 3688080826640678185, - 133253417037384537, - 17501558583799613353, - 14920674569425704293, - 5030578721963251055, - 9795600398273758687, - 402012644192671817, - 10657312189068414445, - 9508835336085746575, - 16081669758721272608, - 2072823794278273547, - 16831381326702573736, - 11381683312293543190, - 5679539322738625588, - 9346499485038639332, - 15554202803455984983, - 18373955571490331663, - 11323895584334729789, - 16834542679468148445, - 14751528164286075953, - 3755158780970327991, - 12622814707645103582, - 10329238611694882547, - 7642766530280843057, - 4876120096290984742, - 412912224820604426, - 9118233770240274553, - 3626520971021993076, - 10841049054903806738, - 18205546599950141835, - 7198482606375262809, - 17183313930831625294, - 10181033256431249241, - 1061211413812819905, - 3980261141891682525, - 5674176959446948353, - 6062696542969845681, - 3383081006315025715, - 8812665902421024067, - 3093645099818246186, - 16178737149039707082, - 8204245222345541411, - 11072582337937050490, - 17969785901925882398, - 4670890092981706609, - 12537558683977529426, - 12084598516323376868, - 16293685096019175644, - 10117612240421467846, - 17873102395739074620, - 11220493906741851877, - 4632957003022201019, - 12934229307704669322, - 2152792796882257594, - 12521131928134126701, - 17472006670677761650, - 4560570065837283016, - 6315543803073912887, - 4098689719955359793, - 1784883877365258237, - 6837590090927294950, - 2391417016765166652, - 16389291664603960875, - 12285946887702044436, - 7231705445010258971, - 12976071926225281356, - 8829402645443096358, - ]); - - let output = MdsMatrixGoldilocks.permute(input); - - let expected: [Goldilocks; 68] = Goldilocks::new_array([ - 4984914285749049383, - 10397959071664799177, - 3331616814639908945, - 4252459885611162121, - 5517786723806029201, - 1826620401370703815, - 8257849352373689773, 
- 1722805960790112693, - 17654983138917187833, - 7542660006721409612, - 1970182718241277021, - 12865815507550811641, - 17507096607056552658, - 7988714902687660369, - 150082662759625574, - 17329095993317360383, - 965880604543562997, - 2820931239306841741, - 1980667983336380501, - 3781794112174728826, - 7323192150179872391, - 12243426826276589932, - 315076483410634889, - 3221894784246078707, - 3515955216509190252, - 964376148920419876, - 7679719864273407732, - 2516714701741920303, - 4837221266652621366, - 15301563603415983061, - 10380321314559647625, - 3023678426639670063, - 12020917879204725519, - 10595808165609787680, - 14199186729378048831, - 4520610719509879248, - 9983949546821718635, - 5066092593424854949, - 13843503196305181790, - 14296362815835302652, - 6766348697864530153, - 13804582129741554661, - 8032169955336281598, - 5198513488794721460, - 10613667919514788349, - 7948289550930596506, - 14118391408956101449, - 4356952068887595371, - 709878153008378134, - 17168579964784489802, - 17840495726541494819, - 2710471020841761312, - 9950159372116756450, - 3909574932971200058, - 2430964021804554670, - 6035162446515244642, - 14656543530572478095, - 1539013407173403800, - 4150113154618904744, - 4904646199269229662, - 17257014030727492672, - 3791823431764085889, - 13680668409434600948, - 12367427987617118934, - 12462908457168650050, - 10891613749697412017, - 6867760775372053830, - 12474954319307005079, - ]); - - assert_eq!(output, expected); - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs deleted file mode 100644 index 89da79e45..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon1.rs +++ /dev/null @@ -1,1143 +0,0 @@ -//! Poseidon1 permutation for Goldilocks. -//! -//! # Overview -//! -//! This module provides the Poseidon1 hash permutation instantiated for the -//! Goldilocks field (p = 2^64 - 2^32 + 1). The public API is a single type -//! 
alias that transparently dispatches to the best available implementation. -//! -//! # Platform Dispatch -//! -//! On **aarch64**, the type alias resolves to a dual-dispatch wrapper: -//! scalar permutations use NEON-accelerated MDS for full rounds with -//! LLVM-optimized sparse partial rounds, while packed NEON permutations -//! use the fused dual-lane ASM path (w8) or per-lane scalar path (w12). -//! -//! On **all other platforms**, it resolves to the generic Poseidon1 -//! implementation with Karatsuba MDS convolution. -//! -//! No `#[cfg]` is needed in calling code. -//! -//! # MDS Matrix -//! -//! The MDS matrix is a **circulant** matrix sourced from the MDS crate. -//! At runtime, it is applied via fast Karatsuba convolution (sub-O(t^2)). -//! During initialization only, it is expanded to dense form for the -//! sparse matrix decomposition of partial rounds. -//! -//! # Round Constants -//! -//! Generated by the Grain LFSR (Poseidon1 paper, Appendix E) with SBOX=0 (x^alpha encoding). - -use p3_poseidon1::{ - Poseidon1, Poseidon1Constants, Poseidon1ExternalLayerGeneric, Poseidon1InternalLayerGeneric, -}; - -use crate::mds::{MATRIX_CIRC_MDS_8_COL, MATRIX_CIRC_MDS_12_COL}; -use crate::{Goldilocks, MdsMatrixGoldilocks}; - -/// S-box degree for Goldilocks Poseidon1. -/// -/// The S-box raises each element to this power. The Goldilocks prime -/// factors as `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. Neither 3 nor 5 -/// are coprime to `p - 1`, so the smallest valid exponent is 7. -pub const GOLDILOCKS_S_BOX_DEGREE: u64 = 7; - -/// Number of full rounds per half for Goldilocks Poseidon (`RF / 2`). -/// -/// The total number of full rounds is `RF = 8` (4 beginning + 4 ending). -/// Follows the Poseidon paper's security analysis (Section 5.4) with a +2 RF margin. -pub const GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS: usize = 4; - -/// Number of partial rounds for Goldilocks Poseidon (width 8). -/// -/// Derived from the interpolation bound in the Poseidon paper (Eq. 
3): -/// -/// R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5 -/// = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20 -/// -/// With the +7.5% security margin (Section 5.4): ⌈1.075 × 20⌉ = 22. -pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8: usize = 22; - -/// Number of partial rounds for Goldilocks Poseidon (width 12). -/// -/// Same interpolation bound as width 8: -/// -/// R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20 -/// -/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22. -pub const GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12: usize = 22; - -/// Generic (non-fused) Poseidon1 permutation for Goldilocks. -/// -/// Uses the platform-independent Poseidon1 implementation with Karatsuba -/// MDS convolution. Used directly for widths not supported by the fused -/// type (e.g. 16, 24) and as the non-aarch64 fallback for widths 8 and 12. -pub type Poseidon1GoldilocksGeneric = Poseidon1< - Goldilocks, - Poseidon1ExternalLayerGeneric, - Poseidon1InternalLayerGeneric, - WIDTH, - GOLDILOCKS_S_BOX_DEGREE, ->; - -/// Unified Poseidon1 permutation for Goldilocks. -/// -/// On aarch64, resolves to a dual-dispatch wrapper: scalar permutations -/// use NEON MDS for full rounds with sparse partial rounds, packed NEON -/// permutations use fused dual-lane ASM (w8) or per-lane scalar (w12). -/// -/// On all other platforms, resolves to the generic implementation with -/// Karatsuba MDS convolution. -/// -/// Supports both scalar and packed state representations transparently. -#[cfg(target_arch = "aarch64")] -pub type Poseidon1Goldilocks = crate::Poseidon1GoldilocksDispatch; - -/// Unified Poseidon1 permutation for Goldilocks. -/// -/// On aarch64, resolves to the fused ASM-optimized implementation that -/// uses inline assembly and dual-lane NEON processing. -/// -/// On all other platforms, resolves to the generic implementation with -/// Karatsuba MDS convolution. -/// -/// Supports both scalar and packed state representations transparently. 
-#[cfg(not(target_arch = "aarch64"))] -pub type Poseidon1Goldilocks = Poseidon1GoldilocksGeneric; - -/// Round constants for width-8 Poseidon1 on Goldilocks. -/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 -/// -/// Generated by `poseidon/generate_constants.py --field goldilocks --width 8`. -/// -/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)]. -pub const GOLDILOCKS_POSEIDON1_RC_8: [[Goldilocks; 8]; 30] = Goldilocks::new_2d_array([ - // Initial full rounds (4) - [ - 0xdd5743e7f2a5a5d9, - 0xcb3a864e58ada44b, - 0xffa2449ed32f8cdc, - 0x42025f65d6bd13ee, - 0x7889175e25506323, - 0x34b98bb03d24b737, - 0xbdcc535ecc4faa2a, - 0x5b20ad869fc0d033, - ], - [ - 0xf1dda5b9259dfcb4, - 0x27515210be112d59, - 0x4227d1718c766c3f, - 0x26d333161a5bd794, - 0x49b938957bf4b026, - 0x4a56b5938b213669, - 0x1120426b48c8353d, - 0x6b323c3f10a56cad, - ], - [ - 0xce57d6245ddca6b2, - 0xb1fc8d402bba1eb1, - 0xb5c5096ca959bd04, - 0x6db55cd306d31f7f, - 0xc49d293a81cb9641, - 0x1ce55a4fe979719f, - 0xa92e60a9d178a4d1, - 0x002cc64973bcfd8c, - ], - [ - 0xcea721cce82fb11b, - 0xe5b55eb8098ece81, - 0x4e30525c6f1ddd66, - 0x43c6702827070987, - 0xaca68430a7b5762a, - 0x3674238634df9c93, - 0x88cee1c825e33433, - 0xde99ae8d74b57176, - ], - // Partial rounds (22) - [ - 0x488897d85ff51f56, - 0x1140737ccb162218, - 0xa7eeb9215866ed35, - 0x9bd2976fee49fcc9, - 0xc0c8f0de580a3fcc, - 0x4fb2dae6ee8fc793, - 0x343a89f35f37395b, - 0x223b525a77ca72c8, - ], - [ - 0x56ccb62574aaa918, - 0xc4d507d8027af9ed, - 0xa080673cf0b7e95c, - 0xf0184884eb70dcf8, - 0x044f10b0cb3d5c69, - 0xe9e3f7993938f186, - 0x1b761c80e772f459, - 0x606cec607a1b5fac, - ], - [ - 0x14a0c2e1d45f03cd, - 0x4eace8855398574f, - 0xf905ca7103eff3e6, - 0xf8c8f8d20862c059, - 0xb524fe8bdd678e5a, - 0xfbb7865901a1ec41, - 0x014ef1197d341346, - 0x9725e20825d07394, - ], - [ - 0xfdb25aef2c5bae3b, - 0xbe5402dc598c971e, - 0x93a5711f04cdca3d, - 0xc45a9a5b2f8fb97b, - 
0xfe8946a924933545, - 0x2af997a27369091c, - 0xaa62c88e0b294011, - 0x058eb9d810ce9f74, - ], - [ - 0xb3cb23eced349ae4, - 0xa3648177a77b4a84, - 0x43153d905992d95d, - 0xf4e2a97cda44aa4b, - 0x5baa2702b908682f, - 0x082923bdf4f750d1, - 0x98ae09a325893803, - 0xf8a6475077968838, - ], - [ - 0xceb0735bf00b2c5f, - 0x0a1a5d953888e072, - 0x2fcb190489f94475, - 0xb5be06270dec69fc, - 0x739cb934b09acf8b, - 0x537750b75ec7f25b, - 0xe9dd318bae1f3961, - 0xf7462137299efe1a, - ], - [ - 0xb1f6b8eee9adb940, - 0xbdebcc8a809dfe6b, - 0x40fc1f791b178113, - 0x3ac1c3362d014864, - 0x9a016184bdb8aeba, - 0x95f2394459fbc25e, - 0xe3f34a07a76a66c2, - 0x8df25f9ad98b1b96, - ], - [ - 0x85ffc27171439d9d, - 0xddcb9a2dcfd26910, - 0x26b5ba4bf3afb94e, - 0xffff9cc7c7651e2f, - 0x8c88364698280b55, - 0xebc114167b910501, - 0x2d77b4d89ecfb516, - 0x332e0828eba151f2, - ], - [ - 0x46fa6a6450dd4735, - 0xd00db7dd92384a33, - 0x5fd4fb751f3a5fc5, - 0x496fb90c0bb65ea2, - 0xf3baec0bb87cc5c7, - 0x862a3c0a7d4c7713, - 0xbf5f38336a3f47d8, - 0x41ad9dbc1394a20c, - ], - [ - 0xcc535945b7dbf0f7, - 0x82af2bc93685bcec, - 0x8e4c8d0c8cebfccd, - 0x17cb39417e84597e, - 0xd4a965a8c749b232, - 0xa2cab040f33f3ee5, - 0xa98811a1fed4e3a6, - 0x1cc48b54f377e2a1, - ], - [ - 0xe40cd4f6c5609a27, - 0x11de79ebca97a4a4, - 0x9177c73d8b7e929d, - 0x2a6fe8085797e792, - 0x3de6e93329f8d5ae, - 0x3f7af9125da962ff, - 0xd710682cfc77d3ac, - 0x48faf05f3b053cf4, - ], - [ - 0x287db8630da89c8b, - 0x4d0de32053cb30e9, - 0x8b37a4f20c5ada7b, - 0xe7cc6ebe78c84ecf, - 0x240bdc0a66a2610d, - 0x8299e7f02caa1650, - 0x380a53fefb6e754e, - 0x684a1d8cf8eb6810, - ], - [ - 0xe839452eb4b8a5e1, - 0xb03fa62e90626af4, - 0x11a688602fbc5efc, - 0x30dda75c355a2d62, - 0x0f712adcb73810de, - 0xffdc1102187f1ae1, - 0x40c34f398254b99c, - 0xede021b9dc289a4a, - ], - [ - 0x8b7b05225c4e7dad, - 0x3bc794346f9d9ff9, - 0xfccb5a57f2ca86ff, - 0xbb1502015a7da9d4, - 0xd7e0a35d4352a015, - 0x27af7a44f8160931, - 0xc37442f6782f4615, - 0xbdf392a9bd095dcb, - ], - [ - 0xc17f55037cf00de9, - 0xbcffedd34c71a874, - 
0x5eb45d2a8133d1f2, - 0xbabe251e1612ebdf, - 0x3efeb9fbe438c536, - 0x2d7cef97b4afe1cf, - 0xe5de1b4660016c0b, - 0xcdcc26c332f5657c, - ], - [ - 0xe01dd653daf15809, - 0xb0a6bdd4b41094b5, - 0x27eac858b0b03a05, - 0x51d43b5e93adbdc0, - 0x8b89a23b0fea5fc9, - 0xdc8ac3b14f7f2fc1, - 0xe793f82f1efec039, - 0x9f6f2cf8969e7b80, - ], - [ - 0x49d45382e0f21d4a, - 0x5f4ad1797cd72786, - 0x4dc3dbebfd45f795, - 0x03a3ef84dba6e1bc, - 0x204bc9b3d3fc4c01, - 0x9ad706081e89b9ba, - 0x638bfb4d840e9f89, - 0x5ef2938cd095ae35, - ], - [ - 0x42cca18ebeb265c8, - 0xb7b2ec5c29aecbf8, - 0x0d84f9535dc78f0f, - 0x04e64ad942e77b8c, - 0xb4880dffffc9da0b, - 0x16db16d9c29adeb1, - 0x09bbaf2a0590cd1e, - 0x76460e74961fcf8d, - ], - [ - 0xed12a2276dfa1553, - 0x0b5acec5de0436fd, - 0x3c6cfea033a1f0a8, - 0x2b5ecefe546cac15, - 0x6e2d82884cd3bf6f, - 0xc134878d1add7b83, - 0x997963422eb7a280, - 0x5e834537ac648cf6, - ], - [ - 0x89e779214737c0b7, - 0x1a8c05e8581ad95b, - 0x8d18b72796437cf7, - 0xe7252c949e04b106, - 0x53267c4fd174585a, - 0xa16ef5d9c81dad47, - 0xda65191937270a46, - 0xcb2a5b55f2df664c, - ], - [ - 0x854aee2dc1924137, - 0xf37013c9d479ece6, - 0x0e163bc0630c4696, - 0x384ee64955048f76, - 0xf65d814e28ee4ec5, - 0xe57bc564fd82f1b1, - 0x4b338937b6876614, - 0x66ee0b04ed43cd8d, - ], - [ - 0x49884bf25f4ef15d, - 0xeb51fe28de1c6f54, - 0x2cd64e84fce8dfcc, - 0x29164a96a541a013, - 0x173ce7558f4cacb8, - 0xeb5b1ce5877c89e9, - 0x5faff4b0f5217bf6, - 0xac42d0b1c20f205e, - ], - // Terminal full rounds (4) - [ - 0xfb1d6bf0ca43221b, - 0x97b0a1b01d6a2955, - 0x08c60bd622952b30, - 0x43f2be0f9e24147c, - 0xfa7268b7d3730f5d, - 0x43a6c419a23983bb, - 0xcd77c1f7b29b113c, - 0xcfa43c9db8eec29f, - ], - [ - 0xcaaa95a6c7365dec, - 0x0a91193f798f3be0, - 0x1104497652735dc6, - 0x35aecb93663b515e, - 0x8dbc9916065aa858, - 0xada8f7a0266579ed, - 0x524dee7bec1ea789, - 0xa93aee9dd5af9521, - ], - [ - 0x9d1f1b54750d707e, - 0x7c9feab87096d5dc, - 0xa2e1fb19f9d4261b, - 0xb714deb448de6346, - 0x225d1f0d011c5403, - 0x1549b7f1d28cedc0, - 0xaef3e46f97d43942, - 
0x6dfc7ffe0b38bf08, - ], - [ - 0x7de853fdc542b663, - 0xa68ecc96610657b2, - 0xe88bb5428af289b1, - 0xd7cfa1504c5569f5, - 0x78a9aad0d642d30a, - 0xd68315f2353dce52, - 0x46e56300f86fcfd5, - 0x323d95332b145fd6, - ], -]); - -/// Round constants for width-12 Poseidon1 on Goldilocks. -/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 -/// -/// Generated by `poseidon/generate_constants.py --field goldilocks --width 12`. -/// -/// Layout: [initial_full (4 rounds), partial (22 rounds), terminal_full (4 rounds)]. -pub const GOLDILOCKS_POSEIDON1_RC_12: [[Goldilocks; 12]; 30] = Goldilocks::new_2d_array([ - // Initial full rounds (4) - [ - 0x13dcf33aba214f46, - 0x30b3b654a1da6d83, - 0x1fc634ada6159b56, - 0x937459964dc03466, - 0xedd2ef2ca7949924, - 0xede9affde0e22f68, - 0x8515b9d6bac9282d, - 0x6b5c07b4e9e900d8, - 0x1ec66368838c8a08, - 0x9042367d80d1fbab, - 0x400283564a3c3799, - 0x4a00be0466bca75e, - ], - [ - 0x7913beee58e3817f, - 0xf545e88532237d90, - 0x22f8cb8736042005, - 0x6f04990e247a2623, - 0xfe22e87ba37c38cd, - 0xd20e32c85ffe2815, - 0x117227674048fe73, - 0x4e9fb7ea98a6b145, - 0xe0866c232b8af08b, - 0x00bbc77916884964, - 0x7031c0fb990d7116, - 0x240a9e87cf35108f, - ], - [ - 0x2e6363a5a12244b3, - 0x5e1c3787d1b5011c, - 0x4132660e2a196e8b, - 0x3a013b648d3d4327, - 0xf79839f49888ea43, - 0xfe85658ebafe1439, - 0xb6889825a14240bd, - 0x578453605541382b, - 0x4508cda8f6b63ce9, - 0x9c3ef35848684c91, - 0x0812bde23c87178c, - 0xfe49638f7f722c14, - ], - [ - 0x8e3f688ce885cbf5, - 0xb8e110acf746a87d, - 0xb4b2e8973a6dabef, - 0x9e714c5da3d462ec, - 0x6438f9033d3d0c15, - 0x24312f7cf1a27199, - 0x23f843bb47acbf71, - 0x9183f11a34be9f01, - 0x839062fbb9d45dbf, - 0x24b56e7e6c2e43fa, - 0xe1683da61c962a72, - 0xa95c63971a19bfa7, - ], - // Partial rounds (22) - [ - 0x4adf842aa75d4316, - 0xf8fbb871aa4ab4eb, - 0x68e85b6eb2dd6aeb, - 0x07a0b06b2d270380, - 0xd94e0228bd282de4, - 0x8bdd91d3250c5278, - 0x209c68b88bba778f, - 0xb5e18cdab77f3877, - 
0xb296a3e808da93fa, - 0x8370ecbda11a327e, - 0x3f9075283775dad8, - 0xb78095bb23c6aa84, - ], - [ - 0x3f36b9fe72ad4e5f, - 0x69bc96780b10b553, - 0x3f1d341f2eb7b881, - 0x4e939e9815838818, - 0xda366b3ae2a31604, - 0xbc89db1e7287d509, - 0x6102f411f9ef5659, - 0x58725c5e7ac1f0ab, - 0x0df5856c798883e7, - 0xf7bb62a8da4c961b, - 0xc68be7c94882a24d, - 0xaf996d5d5cdaedd9, - ], - [ - 0x9717f025e7daf6a5, - 0x6436679e6e7216f4, - 0x8a223d99047af267, - 0xbb512e35a133ba9a, - 0xfbbf44097671aa03, - 0xf04058ebf6811e61, - 0x5cca84703fac7ffb, - 0x9b55c7945de6469f, - 0x8e05bf09808e934f, - 0x2ea900de876307d7, - 0x7748fff2b38dfb89, - 0x6b99a676dd3b5d81, - ], - [ - 0xac4bb7c627cf7c13, - 0xadb6ebe5e9e2f5ba, - 0x2d33378cafa24ae3, - 0x1e5b73807543f8c2, - 0x09208814bfebb10f, - 0x782e64b6bb5b93dd, - 0xadd5a48eac90b50f, - 0xadd4c54c736ea4b1, - 0xd58dbb86ed817fd8, - 0x6d5ed1a533f34ddd, - 0x28686aa3e36b7cb9, - 0x591abd3476689f36, - ], - [ - 0x047d766678f13875, - 0xa2a11112625f5b49, - 0x21fd10a3f8304958, - 0xf9b40711443b0280, - 0xd2697eb8b2bde88e, - 0x3493790b51731b3f, - 0x11caf9dd73764023, - 0x7acfb8f72878164e, - 0x744ec4db23cefc26, - 0x1e00e58f422c6340, - 0x21dd28d906a62dda, - 0xf32a46ab5f465b5f, - ], - [ - 0xbfce13201f3f7e6b, - 0xf30d2e7adb5304e2, - 0xecdf4ee4abad48e9, - 0xf94e82182d395019, - 0x4ee52e3744d887c5, - 0xa1341c7cac0083b2, - 0x2302fb26c30c834a, - 0xaea3c587273bf7d3, - 0xf798e24961823ec7, - 0x962deba3e9a2cd94, - 0xb36ee79485ca4707, - 0xd380199eddd2de52, - ], - [ - 0x70971fc4e6f85305, - 0x8e722f6e5dc32699, - 0xa0883df133052b92, - 0x8f86c6a3eb7d01a4, - 0x763649c8b670bdc5, - 0x830d5c82b808759b, - 0xaa1da8bb91da02e7, - 0x9bc9bf629e211c4d, - 0x0f0a899b10a4dea8, - 0xb883bdcee7c6b356, - 0x78c7101e7496ae1e, - 0x2fd6c5a8bf1e5ca6, - ], - [ - 0xe2a6e06e61fcec9c, - 0xebfce7d5c5b3dbd5, - 0xca2eeca4bb485d85, - 0xc2b875537c42eb69, - 0x6faf849976873328, - 0xfc3fcb6e81ad4cc3, - 0x180dd95503955a28, - 0xd40f19a3c9fe1520, - 0x49d178ddbf7fd96d, - 0x3950bee2e10e0297, - 0x437b90cf295be062, - 0xa5cd126edffad23b, - 
], - [ - 0xdf58134c134491c2, - 0x0677eca229d9f7bd, - 0x492200a1f7d83a3c, - 0xafb58c9810a43645, - 0x7659077c5a9c208e, - 0x30b4bc83706995cd, - 0xc98fa77bbbef3a3b, - 0x84a82905750b3109, - 0x72f2a02326aeb69b, - 0x8d27a2a2d73a848a, - 0xaa9e30a80bde4b68, - 0x63abb1415e050474, - ], - [ - 0x1c4bd1e816050a7e, - 0x15d1502e4f469dfd, - 0x53989d594b0c4cd8, - 0x7a1a4c83cb7e377e, - 0x1b52f8a9944e480e, - 0xeb7b03f76a91a79e, - 0x0073a4fc9328c69e, - 0x2c7b16f8620d9de4, - 0x950d052963e46bc4, - 0x8d201ba1a9c89fac, - 0xd3502941bdf35503, - 0x7c6dfcd5af8676fb, - ], - [ - 0xf8a6cd02e92cdb0b, - 0x6e7500f3a5464b22, - 0x07637eabba4bdd20, - 0x88b82717beee0e14, - 0xbaa2b1cd3dd4c79a, - 0xdfecc3aebec4cfa6, - 0x7561087b0cff0166, - 0x538fcac317a703a6, - 0xd7d6c6eeeeeeea19, - 0xd647b1ee441658a0, - 0xdf4442110236c546, - 0x559ef2c6dd73ec15, - ], - [ - 0x4c0f5fc6c0dda3d1, - 0x685010cc3100cea7, - 0x2fb6ba8aa0344440, - 0xb515f0a3ca75f1fb, - 0x886887eaecb87c10, - 0xf03ec3fd710abb04, - 0xd3b4763e17f543ef, - 0x50d9e5716e78083a, - 0x0bce2385cf8d74ff, - 0xaf23032cd5f0e04b, - 0xd366aa112b6159d9, - 0x810a3ad3ac7979db, - ], - [ - 0x0a4a11d794be40a2, - 0xeebf0cf23b668a3f, - 0x600873fb011d761b, - 0x0bfb5591a02ff618, - 0xa16e2a528910af52, - 0xf6553653e2878421, - 0xccbe7c7a601a30c0, - 0xb18b214fe489f5b3, - 0xe21017ab9e153425, - 0x586099ede17af9a6, - 0x385078b514f50647, - 0xc02b3a9afb89883d, - ], - [ - 0x6d3fbd3b4a9f1de6, - 0x4b4d40a41b0f473c, - 0x838f1887b8f31711, - 0x9396895be5c58a41, - 0x6247a479d66fc2e3, - 0x13fe228a98f2d0a2, - 0x5ba5fde765f9481e, - 0xafb89fa62267e117, - 0xfa4dc1bebcaa6333, - 0xdbab590882b87289, - 0xc3b6c08e23ba9301, - 0xd84b5de94a324fb7, - ], - [ - 0x0d0c371c5b35b850, - 0x7964f570e7188038, - 0x5daf18bbd996604c, - 0x6743bc47b9595258, - 0x5528b9362c59bb71, - 0xac45e25b7127b68c, - 0xa2077d7dfbb606b6, - 0xf3faac6faee378af, - 0x0c6388b51545e884, - 0xd27dbb6944917b61, - 0x89bcac584344c104, - 0x856bab802ce7402d, - ], - [ - 0x2cff3000be1fcd0a, - 0x765f2977fa72a917, - 0x1443711329f5f9d5, - 
0xd35cd0261af2f951, - 0x2a1bb986084ec281, - 0x2334a54b758f23f2, - 0xa9b8cb612caf706b, - 0xb6ba11c4ab1a1017, - 0xde96b0824b4b46e2, - 0xc59d4272c6d92e2c, - 0x389bb5107611754d, - 0x23647fbc77657372, - ], - [ - 0xd5ef60d6f76a42fa, - 0xebb406bb79ac9819, - 0x55faccc709a2f423, - 0xd9d6ea97490091cd, - 0xef3ce5069647a7e4, - 0xdf31625d3fa78464, - 0x242e60fd68f10f66, - 0x39c966cc815f084d, - 0x20e2e22e02bae3f7, - 0xb38919d3f1173d7c, - 0xf17769f6c77084d9, - 0xcc051d8094cac41f, - ], - [ - 0x942069f5d6eece7e, - 0x8d61d3e6f141c572, - 0xc5cef9d85dd605f4, - 0x938f2ac2bf885997, - 0x23bddbace7c48f6c, - 0xc90a6c5ba98537e4, - 0x0be6ee2cca90f6ae, - 0xa026175394ae0e90, - 0x29fca3e314c77628, - 0x2aa2aa8738ab7b77, - 0xe11bbd31fbb8cac6, - 0xb5bbbef1b78a23af, - ], - [ - 0x8b62a5551e9a9797, - 0x3f91073d4d491c80, - 0x4cfa44976396424a, - 0xf8dcb2dfb3aa1b44, - 0x3849409eba1a95f5, - 0x070845799f234380, - 0x184c0093667da1ba, - 0xbd66aafccd51601e, - 0xee6d14e92155b490, - 0x626f2ec1865bc544, - 0x1bd2854bf6485986, - 0x368b8497472f12ef, - ], - [ - 0x4f88cdcdfb791921, - 0xe2c0acfeda9ae781, - 0x9739bc21773469b3, - 0x00ce3ad64dc4bb8f, - 0xaab85a321ee7a4c8, - 0xd5de825be97004f4, - 0x48d676d3a043b1c6, - 0x9c6180b1ff643097, - 0x34882a89dd590b09, - 0xae7e6b0d249c3b1d, - 0x8c016908a04885a1, - 0x83ebaaebc9ae0721, - ], - [ - 0xab21b42e0f642307, - 0xdb46631f62bb29c1, - 0xef29f0399e09b5d9, - 0x5b52fbb3613b8ba1, - 0x57e129fcc96922e6, - 0xcdeb14c9d9204b3a, - 0x1341ef0da8536e34, - 0xd7e3400f2bacde63, - 0x6911eeb42f70d7e5, - 0xc3a2a910a4679767, - 0x1773cbe4a0f6bb28, - 0xe17b0d53e843eab5, - ], - [ - 0x587fa39990b62800, - 0x0d5d32788135879d, - 0x277f7b31fd3a4cdb, - 0xa435290ee56d7efa, - 0xea6f40be35159925, - 0xcb73377a506171cb, - 0xe43c367ce731d82a, - 0x6eb305031ca10c43, - 0xc019a8c622cc84cb, - 0xd5614f5658c612e6, - 0x7b1ecbe957c3ff98, - 0x60db6ee9651a8478, - ], - // Terminal full rounds (4) - [ - 0x9271d450fc9b4117, - 0xcffeea06b6e3aac1, - 0xfa4a44c748d1cd8e, - 0xe64db01ba569b469, - 0xd31005160e4045fe, - 
0x39e0fa013e025f79, - 0xe243be574196a956, - 0x205b2a681e3d2642, - 0x79cae5ad93486bab, - 0xfdf567844e32c295, - 0x331679589bfb7189, - 0xaf06ee32297b89c2, - ], - [ - 0xa6bcae311e498491, - 0x9d16f52c96ac8b3e, - 0x48a674b59393fa35, - 0x0f9e65da3fde3796, - 0x1e098310fc84578c, - 0x559ae5fab1ae8dad, - 0x56bd4d624078881d, - 0xfd8bbbf8fbe817b5, - 0x82d30695c44df534, - 0x3ec0a97bc41127c5, - 0x1eb8b64adaa22078, - 0x82c45e418d60c983, - ], - [ - 0xb092280f484d55bf, - 0xcd317c9537697939, - 0xd3be2e352feb79f3, - 0xca6d866539a390e5, - 0xb5efb1a494e55ee6, - 0xfa9013ac89756e9e, - 0xaeb88efd1e981242, - 0x13ee477cdab6e0dc, - 0xce7df902c40da2d3, - 0xf3fbaf0d4e6f5f34, - 0xf96354ada6785f38, - 0x13b5692812406886, - ], - [ - 0xf03cae030a0f4418, - 0x7d3172887aa98e1a, - 0x8a2c2644f2faf7b9, - 0x80d721abee696d00, - 0x27c8b903a4d68267, - 0xaf0b7b12f90291b8, - 0x00acd08cfdff3817, - 0x4659ee496c634328, - 0xf5b25c10730dbff1, - 0xdde3a153297329c2, - 0x50c0b70d6910a44b, - 0x23c7426af725a6a0, - ], -]); - -/// Create the default width-8 Poseidon1 permutation for Goldilocks. -/// -/// Returns the platform-optimal implementation: dual-dispatch on aarch64 -/// (generic for scalar, fused ASM for packed), generic Karatsuba on all -/// other platforms. -#[cfg(target_arch = "aarch64")] -pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> { - let constants = Poseidon1Constants { - rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - mds_circ_col: MATRIX_CIRC_MDS_8_COL, - round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(), - }; - let (full, partial) = constants.to_optimized(); - let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial); - crate::Poseidon1GoldilocksDispatch::new(fused, full, partial) -} - -/// Create the default width-8 Poseidon1 permutation for Goldilocks. -/// -/// Returns the platform-optimal implementation: fused ASM on aarch64, -/// generic Karatsuba on all other platforms. 
-#[cfg(not(target_arch = "aarch64"))] -pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks<8> { - Poseidon1::new(&Poseidon1Constants { - rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - mds_circ_col: MATRIX_CIRC_MDS_8_COL, - round_constants: GOLDILOCKS_POSEIDON1_RC_8.to_vec(), - }) -} - -/// Create the default width-12 Poseidon1 permutation for Goldilocks. -/// -/// Returns the platform-optimal implementation: dual-dispatch on aarch64 -/// (generic for scalar, fused ASM for packed), generic Karatsuba on all -/// other platforms. -#[cfg(target_arch = "aarch64")] -pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> { - let constants = Poseidon1Constants { - rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, - mds_circ_col: MATRIX_CIRC_MDS_12_COL, - round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(), - }; - let (full, partial) = constants.to_optimized(); - let fused = crate::Poseidon1GoldilocksFused::new(&full, &partial); - crate::Poseidon1GoldilocksDispatch::new(fused, full, partial) -} - -/// Create the default width-12 Poseidon1 permutation for Goldilocks. -/// -/// Returns the platform-optimal implementation: fused ASM on aarch64, -/// generic Karatsuba on all other platforms. -#[cfg(not(target_arch = "aarch64"))] -pub fn default_goldilocks_poseidon1_12() -> Poseidon1Goldilocks<12> { - Poseidon1::new(&Poseidon1Constants { - rounds_f: 2 * GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - rounds_p: GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_12, - mds_circ_col: MATRIX_CIRC_MDS_12_COL, - round_constants: GOLDILOCKS_POSEIDON1_RC_12.to_vec(), - }) -} - -#[cfg(test)] -mod tests { - use p3_symmetric::Permutation; - use rand::SeedableRng; - use rand::rngs::SmallRng; - - use super::*; - - type F = Goldilocks; - - /// Known-answer test for width 8 (sequential 0..7 input). 
- #[test] - fn test_poseidon_goldilocks_width_8() { - let perm = default_goldilocks_poseidon1_8(); - - let mut input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]); - perm.permute_mut(&mut input); - - let expected: [F; 8] = F::new_array([ - 2431226948502761687, - 9427563026145807618, - 6827549936272051660, - 16907684411084503785, - 10131745626715172913, - 17448305483431576765, - 9066501914269485014, - 12095238468458521303, - ]); - assert_eq!(input, expected); - } - - /// Known-answer test for width 12 (sequential 0..11 input). - #[test] - fn test_poseidon_goldilocks_width_12() { - let perm = default_goldilocks_poseidon1_12(); - - let mut input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); - perm.permute_mut(&mut input); - - let expected: [F; 12] = F::new_array([ - 15595088881848875364, - 9564850329150784619, - 13607005230761744521, - 12117102595842533385, - 2814257411756993122, - 11640647689983397089, - 14363867760831937423, - 13323891071259596526, - 11219803511311150468, - 9221595262780869902, - 5898229059046891887, - 18181291031484020550, - ]); - assert_eq!(input, expected); - } - - /// Smoke test for width 16 with random constants. - /// Uses the generic type directly since the fused type only supports 8 and 12. - #[test] - fn test_poseidon_goldilocks_width_16() { - let mut rng = SmallRng::seed_from_u64(1); - let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng( - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - &MdsMatrixGoldilocks, - &mut rng, - ); - let input: [F; 16] = rand::RngExt::random(&mut rng); - let output = poseidon.permute(input); - assert_ne!(output, input); - } - - /// Smoke test for width 24 with random constants. 
- #[test] - fn test_poseidon_goldilocks_width_24() { - let mut rng = SmallRng::seed_from_u64(1); - let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng( - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - &MdsMatrixGoldilocks, - &mut rng, - ); - let input: [F; 24] = rand::RngExt::random(&mut rng); - let output = poseidon.permute(input); - assert_ne!(output, input); - } - - #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] - mod avx512 { - use super::*; - use crate::PackedGoldilocksAVX512; - - #[test] - fn test_avx512_poseidon_width_16() { - let mut rng = SmallRng::seed_from_u64(1); - let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng( - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - &MdsMatrixGoldilocks, - &mut rng, - ); - let input: [F; 16] = rand::RngExt::random(&mut rng); - - let mut expected = input; - poseidon.permute_mut(&mut expected); - - let mut avx512_input = input.map(Into::::into); - poseidon.permute_mut(&mut avx512_input); - - let avx512_output = avx512_input.map(|x| x.0[0]); - assert_eq!(avx512_output, expected); - } - - #[test] - fn test_avx512_poseidon_width_24() { - let mut rng = SmallRng::seed_from_u64(1); - let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng( - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - &MdsMatrixGoldilocks, - &mut rng, - ); - let input: [F; 24] = rand::RngExt::random(&mut rng); - - let mut expected = input; - poseidon.permute_mut(&mut expected); - - let mut avx512_input = input.map(Into::::into); - poseidon.permute_mut(&mut avx512_input); - - let avx512_output = avx512_input.map(|x| x.0[0]); - assert_eq!(avx512_output, expected); - } - } - - #[cfg(all( - target_arch = "x86_64", - target_feature = "avx2", - not(target_feature = "avx512f") - ))] - mod avx2 { - use super::*; - use crate::PackedGoldilocksAVX2; - - #[test] - fn test_avx2_poseidon_width_16() { - let mut rng = 
SmallRng::seed_from_u64(1); - let poseidon = Poseidon1GoldilocksGeneric::<16>::new_from_rng( - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - &MdsMatrixGoldilocks, - &mut rng, - ); - let input: [F; 16] = rand::RngExt::random(&mut rng); - - let mut expected = input; - poseidon.permute_mut(&mut expected); - - let mut avx2_input = input.map(Into::::into); - poseidon.permute_mut(&mut avx2_input); - - let avx2_output = avx2_input.map(|x| x.0[0]); - assert_eq!(avx2_output, expected); - } - - #[test] - fn test_avx2_poseidon_width_24() { - let mut rng = SmallRng::seed_from_u64(1); - let poseidon = Poseidon1GoldilocksGeneric::<24>::new_from_rng( - GOLDILOCKS_POSEIDON_HALF_FULL_ROUNDS, - GOLDILOCKS_POSEIDON_PARTIAL_ROUNDS_8, - &MdsMatrixGoldilocks, - &mut rng, - ); - let input: [F; 24] = rand::RngExt::random(&mut rng); - - let mut expected = input; - poseidon.permute_mut(&mut expected); - - let mut avx2_input = input.map(Into::::into); - poseidon.permute_mut(&mut avx2_input); - - let avx2_output = avx2_input.map(|x| x.0[0]); - assert_eq!(avx2_output, expected); - } - } - - #[cfg(target_arch = "aarch64")] - mod neon { - use super::*; - use crate::PackedGoldilocksNeon; - - #[test] - fn test_neon_poseidon_width_8() { - let perm = default_goldilocks_poseidon1_8(); - let input: [F; 8] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7]); - - let mut expected = input; - perm.permute_mut(&mut expected); - - let mut neon_input = input.map(Into::::into); - perm.permute_mut(&mut neon_input); - - let neon_output = neon_input.map(|x| x.0[0]); - assert_eq!(neon_output, expected); - } - - #[test] - fn test_neon_poseidon_width_12() { - let perm = default_goldilocks_poseidon1_12(); - let input: [F; 12] = F::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); - - let mut expected = input; - perm.permute_mut(&mut expected); - - let mut neon_input = input.map(Into::::into); - perm.permute_mut(&mut neon_input); - - let neon_output = neon_input.map(|x| x.0[0]); - 
assert_eq!(neon_output, expected); - } - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs deleted file mode 100644 index b5d158610..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/poseidon2.rs +++ /dev/null @@ -1,980 +0,0 @@ -//! Implementation of Poseidon2, see: https://eprint.iacr.org/2023/323 - -use alloc::vec::Vec; - -use p3_field::{Algebra, InjectiveMonomial, PrimeCharacteristicRing}; -#[cfg(not(target_arch = "aarch64"))] -use p3_poseidon2::Poseidon2; -use p3_poseidon2::{ - ExternalLayer, ExternalLayerConstants, ExternalLayerConstructor, GenericPoseidon2LinearLayers, - InternalLayer, InternalLayerConstructor, MDSMat4, add_rc_and_sbox_generic, - external_initial_permute_state, external_terminal_permute_state, internal_permute_state, - matmul_internal, -}; - -use crate::Goldilocks; -use crate::poseidon1::GOLDILOCKS_S_BOX_DEGREE; - -/// Number of full rounds per half for Goldilocks Poseidon2 (`RF / 2`). -/// -/// The total number of full rounds is `RF = 8` (4 beginning + 4 ending). -/// Follows the Poseidon2 paper's security analysis with a +2 RF margin. -pub const GOLDILOCKS_POSEIDON2_HALF_FULL_ROUNDS: usize = 4; - -/// Number of partial rounds for Goldilocks Poseidon2 (width 8). -/// -/// Derived from the interpolation bound in the Poseidon paper (Eq. 3): -/// -/// R_interp ≥ ⌈min{κ,n}/log_2(α)⌉ + ⌈log_α(t)⌉ − 5 -/// = ⌈64/log_2(7)⌉ + ⌈log_7(8)⌉ − 5 = 23 + 2 − 5 = 20 -/// -/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22. -pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_8: usize = 22; - -/// Number of partial rounds for Goldilocks Poseidon2 (width 12). -/// -/// Same interpolation bound as width 8: -/// -/// R_interp ≥ ⌈64/log_2(7)⌉ + ⌈log_7(12)⌉ − 5 = 23 + 2 − 5 = 20 -/// -/// With the +7.5% security margin: ⌈1.075 × 20⌉ = 22. 
-pub const GOLDILOCKS_POSEIDON2_PARTIAL_ROUNDS_12: usize = 22; - -/// An implementation of the Poseidon2 hash function for the Goldilocks field. -/// -/// It acts on arrays of the form `[Goldilocks; WIDTH]`. -#[cfg(target_arch = "aarch64")] -pub type Poseidon2Goldilocks = crate::Poseidon2GoldilocksFused; - -/// An implementation of the Poseidon2 hash function for the Goldilocks field. -/// -/// It acts on arrays of the form `[Goldilocks; WIDTH]`. -#[cfg(not(target_arch = "aarch64"))] -pub type Poseidon2Goldilocks = Poseidon2< - Goldilocks, - Poseidon2ExternalLayerGoldilocks, - Poseidon2InternalLayerGoldilocks, - WIDTH, - GOLDILOCKS_S_BOX_DEGREE, ->; - -/// Round constants for width-8 Poseidon2 on Goldilocks. -/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 -/// -/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`. -/// -/// Layout: external_initial (4 rounds × 8 elements). -pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL: [[Goldilocks; 8]; 4] = [ - Goldilocks::new_array([ - 0xdd5743e7f2a5a5d9, - 0xcb3a864e58ada44b, - 0xffa2449ed32f8cdc, - 0x42025f65d6bd13ee, - 0x7889175e25506323, - 0x34b98bb03d24b737, - 0xbdcc535ecc4faa2a, - 0x5b20ad869fc0d033, - ]), - Goldilocks::new_array([ - 0xf1dda5b9259dfcb4, - 0x27515210be112d59, - 0x4227d1718c766c3f, - 0x26d333161a5bd794, - 0x49b938957bf4b026, - 0x4a56b5938b213669, - 0x1120426b48c8353d, - 0x6b323c3f10a56cad, - ]), - Goldilocks::new_array([ - 0xce57d6245ddca6b2, - 0xb1fc8d402bba1eb1, - 0xb5c5096ca959bd04, - 0x6db55cd306d31f7f, - 0xc49d293a81cb9641, - 0x1ce55a4fe979719f, - 0xa92e60a9d178a4d1, - 0x002cc64973bcfd8c, - ]), - Goldilocks::new_array([ - 0xcea721cce82fb11b, - 0xe5b55eb8098ece81, - 0x4e30525c6f1ddd66, - 0x43c6702827070987, - 0xaca68430a7b5762a, - 0x3674238634df9c93, - 0x88cee1c825e33433, - 0xde99ae8d74b57176, - ]), -]; - -/// Round constants for width-8 Poseidon2 on Goldilocks. 
-/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 -/// -/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`. -/// -/// Layout: external_final (4 rounds × 8 elements). -pub const GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL: [[Goldilocks; 8]; 4] = [ - Goldilocks::new_array([ - 0x014ef1197d341346, - 0x9725e20825d07394, - 0xfdb25aef2c5bae3b, - 0xbe5402dc598c971e, - 0x93a5711f04cdca3d, - 0xc45a9a5b2f8fb97b, - 0xfe8946a924933545, - 0x2af997a27369091c, - ]), - Goldilocks::new_array([ - 0xaa62c88e0b294011, - 0x058eb9d810ce9f74, - 0xb3cb23eced349ae4, - 0xa3648177a77b4a84, - 0x43153d905992d95d, - 0xf4e2a97cda44aa4b, - 0x5baa2702b908682f, - 0x082923bdf4f750d1, - ]), - Goldilocks::new_array([ - 0x98ae09a325893803, - 0xf8a6475077968838, - 0xceb0735bf00b2c5f, - 0x0a1a5d953888e072, - 0x2fcb190489f94475, - 0xb5be06270dec69fc, - 0x739cb934b09acf8b, - 0x537750b75ec7f25b, - ]), - Goldilocks::new_array([ - 0xe9dd318bae1f3961, - 0xf7462137299efe1a, - 0xb1f6b8eee9adb940, - 0xbdebcc8a809dfe6b, - 0x40fc1f791b178113, - 0x3ac1c3362d014864, - 0x9a016184bdb8aeba, - 0x95f2394459fbc25e, - ]), -]; - -/// Round constants for width-8 Poseidon2 on Goldilocks. -/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=8, R_F=8, R_P=22 -/// -/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 8`. -/// -/// Layout: internal (22 scalar constants). 
-pub const GOLDILOCKS_POSEIDON2_RC_8_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([ - 0x488897d85ff51f56, - 0x1140737ccb162218, - 0xa7eeb9215866ed35, - 0x9bd2976fee49fcc9, - 0xc0c8f0de580a3fcc, - 0x4fb2dae6ee8fc793, - 0x343a89f35f37395b, - 0x223b525a77ca72c8, - 0x56ccb62574aaa918, - 0xc4d507d8027af9ed, - 0xa080673cf0b7e95c, - 0xf0184884eb70dcf8, - 0x044f10b0cb3d5c69, - 0xe9e3f7993938f186, - 0x1b761c80e772f459, - 0x606cec607a1b5fac, - 0x14a0c2e1d45f03cd, - 0x4eace8855398574f, - 0xf905ca7103eff3e6, - 0xf8c8f8d20862c059, - 0xb524fe8bdd678e5a, - 0xfbb7865901a1ec41, -]); - -/// Round constants for width-12 Poseidon2 on Goldilocks. -/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 -/// -/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`. -/// -/// Layout: external_initial (4 rounds × 12 elements). -pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL: [[Goldilocks; 12]; 4] = [ - Goldilocks::new_array([ - 0x13dcf33aba214f46, - 0x30b3b654a1da6d83, - 0x1fc634ada6159b56, - 0x937459964dc03466, - 0xedd2ef2ca7949924, - 0xede9affde0e22f68, - 0x8515b9d6bac9282d, - 0x6b5c07b4e9e900d8, - 0x1ec66368838c8a08, - 0x9042367d80d1fbab, - 0x400283564a3c3799, - 0x4a00be0466bca75e, - ]), - Goldilocks::new_array([ - 0x7913beee58e3817f, - 0xf545e88532237d90, - 0x22f8cb8736042005, - 0x6f04990e247a2623, - 0xfe22e87ba37c38cd, - 0xd20e32c85ffe2815, - 0x117227674048fe73, - 0x4e9fb7ea98a6b145, - 0xe0866c232b8af08b, - 0x00bbc77916884964, - 0x7031c0fb990d7116, - 0x240a9e87cf35108f, - ]), - Goldilocks::new_array([ - 0x2e6363a5a12244b3, - 0x5e1c3787d1b5011c, - 0x4132660e2a196e8b, - 0x3a013b648d3d4327, - 0xf79839f49888ea43, - 0xfe85658ebafe1439, - 0xb6889825a14240bd, - 0x578453605541382b, - 0x4508cda8f6b63ce9, - 0x9c3ef35848684c91, - 0x0812bde23c87178c, - 0xfe49638f7f722c14, - ]), - Goldilocks::new_array([ - 0x8e3f688ce885cbf5, - 0xb8e110acf746a87d, - 0xb4b2e8973a6dabef, - 0x9e714c5da3d462ec, - 
0x6438f9033d3d0c15, - 0x24312f7cf1a27199, - 0x23f843bb47acbf71, - 0x9183f11a34be9f01, - 0x839062fbb9d45dbf, - 0x24b56e7e6c2e43fa, - 0xe1683da61c962a72, - 0xa95c63971a19bfa7, - ]), -]; - -/// Round constants for width-12 Poseidon2 on Goldilocks. -/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 -/// -/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`. -/// -/// Layout: external_final (4 rounds × 12 elements). -pub const GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL: [[Goldilocks; 12]; 4] = [ - Goldilocks::new_array([ - 0xc68be7c94882a24d, - 0xaf996d5d5cdaedd9, - 0x9717f025e7daf6a5, - 0x6436679e6e7216f4, - 0x8a223d99047af267, - 0xbb512e35a133ba9a, - 0xfbbf44097671aa03, - 0xf04058ebf6811e61, - 0x5cca84703fac7ffb, - 0x9b55c7945de6469f, - 0x8e05bf09808e934f, - 0x2ea900de876307d7, - ]), - Goldilocks::new_array([ - 0x7748fff2b38dfb89, - 0x6b99a676dd3b5d81, - 0xac4bb7c627cf7c13, - 0xadb6ebe5e9e2f5ba, - 0x2d33378cafa24ae3, - 0x1e5b73807543f8c2, - 0x09208814bfebb10f, - 0x782e64b6bb5b93dd, - 0xadd5a48eac90b50f, - 0xadd4c54c736ea4b1, - 0xd58dbb86ed817fd8, - 0x6d5ed1a533f34ddd, - ]), - Goldilocks::new_array([ - 0x28686aa3e36b7cb9, - 0x591abd3476689f36, - 0x047d766678f13875, - 0xa2a11112625f5b49, - 0x21fd10a3f8304958, - 0xf9b40711443b0280, - 0xd2697eb8b2bde88e, - 0x3493790b51731b3f, - 0x11caf9dd73764023, - 0x7acfb8f72878164e, - 0x744ec4db23cefc26, - 0x1e00e58f422c6340, - ]), - Goldilocks::new_array([ - 0x21dd28d906a62dda, - 0xf32a46ab5f465b5f, - 0xbfce13201f3f7e6b, - 0xf30d2e7adb5304e2, - 0xecdf4ee4abad48e9, - 0xf94e82182d395019, - 0x4ee52e3744d887c5, - 0xa1341c7cac0083b2, - 0x2302fb26c30c834a, - 0xaea3c587273bf7d3, - 0xf798e24961823ec7, - 0x962deba3e9a2cd94, - ]), -]; - -/// Round constants for width-12 Poseidon2 on Goldilocks. 
-/// -/// Generated by the Grain LFSR with parameters: -/// field_type=1, alpha=7 (exp_flag=0), n=64, t=12, R_F=8, R_P=22 -/// -/// Generated by `poseidon2/generate_constants.py --field goldilocks --width 12`. -/// -/// Layout: internal (22 scalar constants). -pub const GOLDILOCKS_POSEIDON2_RC_12_INTERNAL: [Goldilocks; 22] = Goldilocks::new_array([ - 0x4adf842aa75d4316, - 0xf8fbb871aa4ab4eb, - 0x68e85b6eb2dd6aeb, - 0x07a0b06b2d270380, - 0xd94e0228bd282de4, - 0x8bdd91d3250c5278, - 0x209c68b88bba778f, - 0xb5e18cdab77f3877, - 0xb296a3e808da93fa, - 0x8370ecbda11a327e, - 0x3f9075283775dad8, - 0xb78095bb23c6aa84, - 0x3f36b9fe72ad4e5f, - 0x69bc96780b10b553, - 0x3f1d341f2eb7b881, - 0x4e939e9815838818, - 0xda366b3ae2a31604, - 0xbc89db1e7287d509, - 0x6102f411f9ef5659, - 0x58725c5e7ac1f0ab, - 0x0df5856c798883e7, - 0xf7bb62a8da4c961b, -]); - -/// Create a default width-8 Poseidon2 permutation for Goldilocks. -#[cfg(not(target_arch = "aarch64"))] -pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> { - Poseidon2::new( - ExternalLayerConstants::new( - GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(), - GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(), - ), - GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec(), - ) -} - -/// Create a default width-8 Poseidon2 permutation for Goldilocks. -#[cfg(target_arch = "aarch64")] -pub fn default_goldilocks_poseidon2_8() -> Poseidon2Goldilocks<8> { - crate::Poseidon2GoldilocksFused::new( - &ExternalLayerConstants::new( - GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(), - GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(), - ), - &GOLDILOCKS_POSEIDON2_RC_8_INTERNAL, - ) -} - -/// Create a default width-12 Poseidon2 permutation for Goldilocks. 
-#[cfg(not(target_arch = "aarch64"))] -pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> { - Poseidon2::new( - ExternalLayerConstants::new( - GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(), - GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(), - ), - GOLDILOCKS_POSEIDON2_RC_12_INTERNAL.to_vec(), - ) -} - -/// Create a default width-12 Poseidon2 permutation for Goldilocks. -#[cfg(target_arch = "aarch64")] -pub fn default_goldilocks_poseidon2_12() -> Poseidon2Goldilocks<12> { - crate::Poseidon2GoldilocksFused::new( - &ExternalLayerConstants::new( - GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_INITIAL.to_vec(), - GOLDILOCKS_POSEIDON2_RC_12_EXTERNAL_FINAL.to_vec(), - ), - &GOLDILOCKS_POSEIDON2_RC_12_INTERNAL, - ) -} - -pub const MATRIX_DIAG_8_GOLDILOCKS: [Goldilocks; 8] = Goldilocks::new_array([ - 0xfffffffeffffffff, // -2 - 0x0000000000000001, // 1 - 0x0000000000000002, // 2 - 0x7fffffff80000001, // 1/2 - 0x0000000000000003, // 3 - 0x7fffffff80000000, // -1/2 - 0xfffffffefffffffe, // -3 - 0xfffffffefffffffd, // -4 -]); - -pub const MATRIX_DIAG_12_GOLDILOCKS: [Goldilocks; 12] = Goldilocks::new_array([ - 0xfffffffeffffffff, // -2 - 0x0000000000000001, // 1 - 0x0000000000000002, // 2 - 0x7fffffff80000001, // 1/2 - 0x0000000000000003, // 3 - 0x0000000000000004, // 4 - 0x7fffffff80000000, // -1/2 - 0xfffffffefffffffe, // -3 - 0xfffffffefffffffd, // -4 - 0xbfffffff40000001, // 1/2^2 - 0x3fffffffc0000000, // -1/2^2 - 0xdfffffff20000001, // 1/2^3 -]); - -pub const MATRIX_DIAG_16_GOLDILOCKS: [Goldilocks; 16] = Goldilocks::new_array([ - 0xfffffffeffffffff, // -2 - 0x0000000000000001, // 1 - 0x0000000000000002, // 2 - 0x7fffffff80000001, // 1/2 - 0x0000000000000003, // 3 - 0x0000000000000004, // 4 - 0x7fffffff80000000, // -1/2 - 0xfffffffefffffffe, // -3 - 0xfffffffefffffffd, // -4 - 0xdfffffff20000001, // 1/2^3 - 0xefffffff10000001, // 1/2^4 - 0xf7ffffff08000001, // 1/2^5 - 0x1fffffffe0000000, // -1/2^3 - 0x0ffffffff0000000, // -1/2^4 - 0x07fffffff8000000, // 
-1/2^5 - 0xfffffffe00000002, // 1/2^32 -]); - -pub const MATRIX_DIAG_20_GOLDILOCKS: [Goldilocks; 20] = Goldilocks::new_array([ - 0x95c381fda3b1fa57, - 0xf36fe9eb1288f42c, - 0x89f5dcdfef277944, - 0x106f22eadeb3e2d2, - 0x684e31a2530e5111, - 0x27435c5d89fd148e, - 0x3ebed31c414dbf17, - 0xfd45b0b2d294e3cc, - 0x48c904473a7f6dbf, - 0xe0d1b67809295b4d, - 0xddd1941e9d199dcb, - 0x8cfe534eeb742219, - 0xa6e5261d9e3b8524, - 0x6897ee5ed0f82c1b, - 0x0e7dcd0739ee5f78, - 0x493253f3d0d32363, - 0xbb2737f5845f05c0, - 0xa187e810b06ad903, - 0xb635b995936c4918, - 0x0b3694a940bd2394, -]); - -fn internal_layer_mat_mul_goldilocks_8>(state: &mut [A; 8]) { - let sum: A = state.iter().map(|r| r.dup()).sum(); - - let s0 = state[0].dup(); - let s1 = state[1].dup(); - let s2 = state[2].dup(); - let s3 = state[3].dup(); - let s4 = state[4].dup(); - let s5 = state[5].dup(); - let s6 = state[6].dup(); - let s7 = state[7].dup(); - - // V[0] = -2 - let two_s0 = s0.dup() + s0; - state[0] = sum.dup() - two_s0; - - // V[1] = 1 - state[1] = sum.dup() + s1; - - // V[2] = 2 - let two_s2 = s2.dup() + s2; - state[2] = sum.dup() + two_s2; - - // V[3] = 1/2 - state[3] = sum.dup() + s3.halve(); - - // V[4] = 3 - let two_s4 = s4.dup() + s4.dup(); - let three_s4 = two_s4 + s4; - state[4] = sum.dup() + three_s4; - - // V[5] = -1/2 - state[5] = sum.dup() - s5.halve(); - - // V[6] = -3 - let two_s6 = s6.dup() + s6.dup(); - let three_s6 = two_s6 + s6; - state[6] = sum.dup() - three_s6; - - // V[7] = -4 - let two_s7 = s7.dup() + s7; - let four_s7 = two_s7.dup() + two_s7; - state[7] = sum - four_s7; -} - -fn internal_layer_mat_mul_goldilocks_12>(state: &mut [A; 12]) { - let sum: A = state.iter().map(|r| r.dup()).sum(); - - let s0 = state[0].dup(); - let s1 = state[1].dup(); - let s2 = state[2].dup(); - let s3 = state[3].dup(); - let s4 = state[4].dup(); - let s5 = state[5].dup(); - let s6 = state[6].dup(); - let s7 = state[7].dup(); - let s8 = state[8].dup(); - let s9 = state[9].dup(); - let s10 = state[10].dup(); - let 
s11 = state[11].dup(); - - // V[0] = -2 - let two_s0 = s0.dup() + s0; - state[0] = sum.dup() - two_s0; - - // V[1] = 1 - state[1] = sum.dup() + s1; - - // V[2] = 2 - let two_s2 = s2.dup() + s2; - state[2] = sum.dup() + two_s2; - - // V[3] = 1/2 - state[3] = sum.dup() + s3.halve(); - - // V[4] = 3 - let two_s4 = s4.dup() + s4.dup(); - let three_s4 = two_s4 + s4; - state[4] = sum.dup() + three_s4; - - // V[5] = 4 - let two_s5 = s5.dup() + s5; - let four_s5 = two_s5.dup() + two_s5; - state[5] = sum.dup() + four_s5; - - // V[6] = -1/2 - state[6] = sum.dup() - s6.halve(); - - // V[7] = -3 - let two_s7 = s7.dup() + s7.dup(); - let three_s7 = two_s7 + s7; - state[7] = sum.dup() - three_s7; - - // V[8] = -4 - let two_s8 = s8.dup() + s8; - let four_s8 = two_s8.dup() + two_s8; - state[8] = sum.dup() - four_s8; - - // V[9] = 1/2^2 - state[9] = sum.dup() + s9.halve().halve(); - - // V[10] = -1/2^2 - state[10] = sum.dup() - s10.halve().halve(); - - // V[11] = 1/2^3 - state[11] = sum + s11.halve().halve().halve(); -} - -fn internal_layer_mat_mul_goldilocks_16>(state: &mut [A; 16]) { - let sum: A = state.iter().map(|r| r.dup()).sum(); - - let s0 = state[0].dup(); - let s1 = state[1].dup(); - let s2 = state[2].dup(); - let s3 = state[3].dup(); - let s4 = state[4].dup(); - let s5 = state[5].dup(); - let s6 = state[6].dup(); - let s7 = state[7].dup(); - let s8 = state[8].dup(); - let s9 = state[9].dup(); - let s10 = state[10].dup(); - let s11 = state[11].dup(); - let s12 = state[12].dup(); - let s13 = state[13].dup(); - let s14 = state[14].dup(); - let s15 = state[15].dup(); - - // V[0] = -2 - let two_s0 = s0.dup() + s0; - state[0] = sum.dup() - two_s0; - - // V[1] = 1 - state[1] = sum.dup() + s1; - - // V[2] = 2 - let two_s2 = s2.dup() + s2; - state[2] = sum.dup() + two_s2; - - // V[3] = 1/2 - state[3] = sum.dup() + s3.halve(); - - // V[4] = 3 - let two_s4 = s4.dup() + s4.dup(); - let three_s4 = two_s4 + s4; - state[4] = sum.dup() + three_s4; - - // V[5] = 4 - let two_s5 = s5.dup() 
+ s5; - let four_s5 = two_s5.dup() + two_s5; - state[5] = sum.dup() + four_s5; - - // V[6] = -1/2 - state[6] = sum.dup() - s6.halve(); - - // V[7] = -3 - let two_s7 = s7.dup() + s7.dup(); - let three_s7 = two_s7 + s7; - state[7] = sum.dup() - three_s7; - - // V[8] = -4 - let two_s8 = s8.dup() + s8; - let four_s8 = two_s8.dup() + two_s8; - state[8] = sum.dup() - four_s8; - - // V[9] = 1/2^3 - state[9] = sum.dup() + s9.halve().halve().halve(); - - // V[10] = 1/2^4 - state[10] = sum.dup() + s10.halve().halve().halve().halve(); - - // V[11] = 1/2^5 - state[11] = sum.dup() + s11.halve().halve().halve().halve().halve(); - - // V[12] = -1/2^3 - state[12] = sum.dup() - s12.halve().halve().halve(); - - // V[13] = -1/2^4 - state[13] = sum.dup() - s13.halve().halve().halve().halve(); - - // V[14] = -1/2^5 - state[14] = sum.dup() - s14.halve().halve().halve().halve().halve(); - - // V[15] = 1/2^32 - let inv_2_32 = MATRIX_DIAG_16_GOLDILOCKS[15]; - let v15 = s15 * inv_2_32; - state[15] = sum + v15; -} - -/// The internal layers of the Poseidon2 permutation. -#[derive(Debug, Clone, Default)] -pub struct Poseidon2InternalLayerGoldilocks { - internal_constants: Vec, -} - -impl InternalLayerConstructor for Poseidon2InternalLayerGoldilocks { - fn new_from_constants(internal_constants: Vec) -> Self { - Self { internal_constants } - } -} - -impl + InjectiveMonomial> - InternalLayer for Poseidon2InternalLayerGoldilocks -{ - /// Perform the internal layers of the Poseidon2 permutation on the given state. - fn permute_state(&self, state: &mut [A; 8]) { - internal_permute_state( - state, - internal_layer_mat_mul_goldilocks_8, - &self.internal_constants, - ); - } -} - -impl + InjectiveMonomial> - InternalLayer for Poseidon2InternalLayerGoldilocks -{ - /// Perform the internal layers of the Poseidon2 permutation on the given state. 
- fn permute_state(&self, state: &mut [A; 12]) { - internal_permute_state( - state, - internal_layer_mat_mul_goldilocks_12, - &self.internal_constants, - ); - } -} - -impl + InjectiveMonomial> - InternalLayer for Poseidon2InternalLayerGoldilocks -{ - /// Perform the internal layers of the Poseidon2 permutation on the given state. - fn permute_state(&self, state: &mut [A; 16]) { - internal_permute_state( - state, - internal_layer_mat_mul_goldilocks_16, - &self.internal_constants, - ); - } -} - -impl + InjectiveMonomial> - InternalLayer for Poseidon2InternalLayerGoldilocks -{ - /// Perform the internal layers of the Poseidon2 permutation on the given state. - fn permute_state(&self, state: &mut [A; 20]) { - internal_permute_state( - state, - |x| matmul_internal(x, MATRIX_DIAG_20_GOLDILOCKS), - &self.internal_constants, - ); - } -} - -/// The external layers of the Poseidon2 permutation. -#[derive(Clone)] -pub struct Poseidon2ExternalLayerGoldilocks { - pub(crate) external_constants: ExternalLayerConstants, -} - -impl ExternalLayerConstructor - for Poseidon2ExternalLayerGoldilocks -{ - fn new_from_constants(external_constants: ExternalLayerConstants) -> Self { - Self { external_constants } - } -} - -impl + InjectiveMonomial, const WIDTH: usize> - ExternalLayer for Poseidon2ExternalLayerGoldilocks -{ - /// Perform the initial external layers of the Poseidon2 permutation on the given state. - fn permute_state_initial(&self, state: &mut [A; WIDTH]) { - external_initial_permute_state( - state, - self.external_constants.get_initial_constants(), - add_rc_and_sbox_generic, - &MDSMat4, - ); - } - - /// Perform the terminal external layers of the Poseidon2 permutation on the given state. 
- fn permute_state_terminal(&self, state: &mut [A; WIDTH]) { - external_terminal_permute_state( - state, - self.external_constants.get_terminal_constants(), - add_rc_and_sbox_generic, - &MDSMat4, - ); - } -} - -/// An implementation of the matrix multiplications in the internal and external layers of Poseidon2. -/// -/// This can act on `[A; WIDTH]` for any ring implementing `Algebra`. -/// If you have either `[Goldilocks::Packing; WIDTH]` or `[Goldilocks; WIDTH]` it will be much faster -/// to use `Poseidon2Goldilocks` instead of building a Poseidon2 permutation using this. -#[derive(Clone, Debug, Default)] -pub struct GenericPoseidon2LinearLayersGoldilocks; - -impl GenericPoseidon2LinearLayers<8> for GenericPoseidon2LinearLayersGoldilocks { - fn internal_linear_layer(state: &mut [R; 8]) { - let sum: R = state.iter().map(|r| r.dup()).sum(); - for i in 0..8 { - let d = R::from_u64(MATRIX_DIAG_8_GOLDILOCKS[i].value); - state[i] *= d; - state[i] += sum.dup(); - } - } -} - -impl GenericPoseidon2LinearLayers<12> for GenericPoseidon2LinearLayersGoldilocks { - fn internal_linear_layer(state: &mut [R; 12]) { - let sum: R = state.iter().map(|r| r.dup()).sum(); - for i in 0..12 { - let d = R::from_u64(MATRIX_DIAG_12_GOLDILOCKS[i].value); - state[i] *= d; - state[i] += sum.dup(); - } - } -} - -impl GenericPoseidon2LinearLayers<16> for GenericPoseidon2LinearLayersGoldilocks { - fn internal_linear_layer(state: &mut [R; 16]) { - let sum: R = state.iter().map(|r| r.dup()).sum(); - for i in 0..16 { - let d = R::from_u64(MATRIX_DIAG_16_GOLDILOCKS[i].value); - state[i] *= d; - state[i] += sum.dup(); - } - } -} - -impl GenericPoseidon2LinearLayers<20> for GenericPoseidon2LinearLayersGoldilocks { - fn internal_linear_layer(state: &mut [R; 20]) { - let sum: R = state.iter().map(|r| r.dup()).sum(); - for i in 0..20 { - let d = R::from_u64(MATRIX_DIAG_20_GOLDILOCKS[i].value); - state[i] *= d; - state[i] += sum.dup(); - } - } -} - -#[cfg(test)] -mod tests { - use 
p3_field::PrimeCharacteristicRing; - use p3_symmetric::Permutation; - - use super::*; - - type F = Goldilocks; - - #[test] - fn test_generic_internal_linear_layer_8_matches_matmul_internal() { - let mut state_generic = [ - F::from_u64(1), - F::from_u64(2), - F::from_u64(3), - F::from_u64(4), - F::from_u64(5), - F::from_u64(6), - F::from_u64(7), - F::from_u64(8), - ]; - let mut state_existing = state_generic; - - GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); - matmul_internal(&mut state_existing, MATRIX_DIAG_8_GOLDILOCKS); - - assert_eq!(state_generic, state_existing); - } - - #[test] - fn test_generic_internal_linear_layer_12_matches_matmul_internal() { - let mut state_generic = [ - F::from_u64(1), - F::from_u64(2), - F::from_u64(3), - F::from_u64(4), - F::from_u64(5), - F::from_u64(6), - F::from_u64(7), - F::from_u64(8), - F::from_u64(9), - F::from_u64(10), - F::from_u64(11), - F::from_u64(12), - ]; - let mut state_existing = state_generic; - - GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); - matmul_internal(&mut state_existing, MATRIX_DIAG_12_GOLDILOCKS); - - assert_eq!(state_generic, state_existing); - } - - #[test] - fn test_generic_internal_linear_layer_16_matches_matmul_internal() { - let mut state_generic = [ - F::from_u64(1), - F::from_u64(2), - F::from_u64(3), - F::from_u64(4), - F::from_u64(5), - F::from_u64(6), - F::from_u64(7), - F::from_u64(8), - F::from_u64(9), - F::from_u64(10), - F::from_u64(11), - F::from_u64(12), - F::from_u64(13), - F::from_u64(14), - F::from_u64(15), - F::from_u64(16), - ]; - let mut state_existing = state_generic; - - GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); - matmul_internal(&mut state_existing, MATRIX_DIAG_16_GOLDILOCKS); - - assert_eq!(state_generic, state_existing); - } - - #[test] - fn test_generic_internal_linear_layer_20_matches_matmul_internal() { - let mut state_generic = [ - F::from_u64(1), - 
F::from_u64(2), - F::from_u64(3), - F::from_u64(4), - F::from_u64(5), - F::from_u64(6), - F::from_u64(7), - F::from_u64(8), - F::from_u64(9), - F::from_u64(10), - F::from_u64(11), - F::from_u64(12), - F::from_u64(13), - F::from_u64(14), - F::from_u64(15), - F::from_u64(16), - F::from_u64(17), - F::from_u64(18), - F::from_u64(19), - F::from_u64(20), - ]; - let mut state_existing = state_generic; - - GenericPoseidon2LinearLayersGoldilocks::internal_linear_layer(&mut state_generic); - matmul_internal(&mut state_existing, MATRIX_DIAG_20_GOLDILOCKS); - - assert_eq!(state_generic, state_existing); - } - - #[test] - fn test_default_goldilocks_poseidon2_width_8() { - let mut input: [F; 8] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7]); - - let expected: [F; 8] = Goldilocks::new_array([ - 0x020cf04a1b214d14, - 0x84e14aaaeacaed25, - 0x1ae0f640e81c7457, - 0xa4d204cbaeb0d8a5, - 0x0cf637b627b3a7ff, - 0x788d304d948b486b, - 0x7327133ea1949af4, - 0xf415abb924da395b, - ]); - - let perm = default_goldilocks_poseidon2_8(); - perm.permute_mut(&mut input); - - assert_eq!(input, expected); - } - - #[test] - fn test_default_goldilocks_poseidon2_width_12() { - let mut input: [F; 12] = Goldilocks::new_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]); - - let expected: [F; 12] = Goldilocks::new_array([ - 0xf292ab67c0f14b03, - 0x0a32f1b37656544c, - 0x053c61ab895498de, - 0x02ff92e55b196ffb, - 0x58176e8f6f58cab2, - 0xb0aa1206e7aec0f8, - 0xe90c13f3dce83ca4, - 0xf4da15333edf39c2, - 0x23b701c053c2ca6c, - 0xd233d593dcdfbf58, - 0x4effa5f9516fb52e, - 0x0aaf4489f1f40166, - ]); - - let perm = default_goldilocks_poseidon2_12(); - perm.permute_mut(&mut input); - - assert_eq!(input, expected); - } -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs deleted file mode 100644 index 44fe4fa3f..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mds.rs +++ /dev/null @@ -1,86 +0,0 @@ -use 
p3_mds::MdsPermutation; -use p3_mds::util::apply_circulant; -use p3_symmetric::Permutation; - -use crate::x86_64_avx2::packing::PackedGoldilocksAVX2; -use crate::{ - MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW, - MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks, -}; -const fn convert_array(arr: [i64; N]) -> [u64; N] { - let mut result: [u64; N] = [0; N]; - let mut i = 0; - while i < N { - result[i] = arr[i] as u64; - i += 1; - } - result -} - -impl Permutation<[PackedGoldilocksAVX2; 8]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX2; 8]) -> [PackedGoldilocksAVX2; 8] { - const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW); - apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksAVX2; 12]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX2; 12]) -> [PackedGoldilocksAVX2; 12] { - const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW); - apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksAVX2; 16]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX2; 16]) -> [PackedGoldilocksAVX2; 16] { - const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW); - apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksAVX2; 24]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX2; 24]) -> [PackedGoldilocksAVX2; 24] { - apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[cfg(test)] -mod tests { - use p3_symmetric::Permutation; - use rand::rngs::SmallRng; - use rand::{RngExt, SeedableRng}; - - 
use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX2}; - - macro_rules! test_avx2_mds { - ($name:ident, $width:literal) => { - #[test] - fn $name() { - let mut rng = SmallRng::seed_from_u64(1); - let mds = MdsMatrixGoldilocks; - - let input: [Goldilocks; $width] = rng.random(); - let expected = mds.permute(input); - - let packed_input = input.map(Into::::into); - let packed_output = mds.permute(packed_input); - - let avx2_output = packed_output.map(|x| x.0[0]); - assert_eq!(avx2_output, expected); - } - }; - } - - test_avx2_mds!(test_avx2_mds_width_8, 8); - test_avx2_mds!(test_avx2_mds_width_12, 12); - test_avx2_mds!(test_avx2_mds_width_16, 16); - test_avx2_mds!(test_avx2_mds_width_24, 24); -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs deleted file mode 100644 index 09300a20f..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod mds; -mod packing; -pub use packing::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs deleted file mode 100644 index 217a2b2e0..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx2/packing.rs +++ /dev/null @@ -1,539 +0,0 @@ -use alloc::vec::Vec; -use core::arch::x86_64::*; -use core::fmt::Debug; -use core::iter::{Product, Sum}; -use core::mem::transmute; -use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; - -use p3_field::exponentiation::exp_10540996611094048183; -use p3_field::interleave::{interleave_u64, interleave_u128}; -use p3_field::op_assign_macros::{ - impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, - impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, - ring_sum, -}; -use p3_field::{ - Algebra, Field, InjectiveMonomial, 
PackedField, PackedFieldPow2, PackedValue, - PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2, -}; -use p3_util::reconstitute_from_base; -use rand::distr::{Distribution, StandardUniform}; -use rand::{Rng, RngExt}; - -use crate::{Goldilocks, P}; - -const WIDTH: usize = 4; - -/// Vectorized AVX2 implementation of `Goldilocks` arithmetic. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] -#[repr(transparent)] // Needed to make `transmute`s safe. -#[must_use] -pub struct PackedGoldilocksAVX2(pub [Goldilocks; WIDTH]); - -impl PackedGoldilocksAVX2 { - /// Get an arch-specific vector representing the packed values. - #[inline] - #[must_use] - pub(crate) fn to_vector(self) -> __m256i { - unsafe { - // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It - // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be - // transmuted to `__m256i`, since arrays are guaranteed to be contiguous in memory. - // Finally `PackedGoldilocksAVX2` is `repr(transparent)` so it can be transmuted to - // `[Goldilocks; WIDTH]`. - transmute(self) - } - } - - /// Make a packed field vector from an arch-specific vector. - /// - /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function - /// is safe unlike the `Mersenne31/MontyField31` variants. - #[inline] - pub(crate) fn from_vector(vector: __m256i) -> Self { - unsafe { - // Safety: `__m256i` can be transmuted to `[u64; WIDTH]` (since arrays elements are - // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since - // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to - // `PackedGoldilocksAVX2` (since `PackedGoldilocksAVX2` is also `repr(transparent)`). - transmute(vector) - } - } - - /// Copy `value` to all positions in a packed vector. This is the same as - /// `From::from`, but `const`. 
- #[inline] - const fn broadcast(value: Goldilocks) -> Self { - Self([value; WIDTH]) - } -} - -impl From for PackedGoldilocksAVX2 { - fn from(x: Goldilocks) -> Self { - Self::broadcast(x) - } -} - -impl Add for PackedGoldilocksAVX2 { - type Output = Self; - #[inline] - fn add(self, rhs: Self) -> Self { - Self::from_vector(add(self.to_vector(), rhs.to_vector())) - } -} - -impl Sub for PackedGoldilocksAVX2 { - type Output = Self; - #[inline] - fn sub(self, rhs: Self) -> Self { - Self::from_vector(sub(self.to_vector(), rhs.to_vector())) - } -} - -impl Neg for PackedGoldilocksAVX2 { - type Output = Self; - #[inline] - fn neg(self) -> Self { - Self::from_vector(neg(self.to_vector())) - } -} - -impl Mul for PackedGoldilocksAVX2 { - type Output = Self; - #[inline] - fn mul(self, rhs: Self) -> Self { - Self::from_vector(mul(self.to_vector(), rhs.to_vector())) - } -} - -impl_add_assign!(PackedGoldilocksAVX2); -impl_sub_assign!(PackedGoldilocksAVX2); -impl_mul_methods!(PackedGoldilocksAVX2); -ring_sum!(PackedGoldilocksAVX2); -impl_rng!(PackedGoldilocksAVX2); - -impl PrimeCharacteristicRing for PackedGoldilocksAVX2 { - type PrimeSubfield = Goldilocks; - - const ZERO: Self = Self::broadcast(Goldilocks::ZERO); - const ONE: Self = Self::broadcast(Goldilocks::ONE); - const TWO: Self = Self::broadcast(Goldilocks::TWO); - const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE); - - #[inline] - fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { - f.into() - } - - #[inline] - fn halve(&self) -> Self { - Self::from_vector(halve(self.to_vector())) - } - - #[inline] - fn square(&self) -> Self { - Self::from_vector(square(self.to_vector())) - } - - #[inline] - fn zero_vec(len: usize) -> Vec { - // SAFETY: this is a repr(transparent) wrapper around an array. - unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) } - } -} - -// Degree of the smallest permutation polynomial for Goldilocks. -// -// As p - 1 = 2^32 * 3 * 5 * 17 * ... 
the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7. -impl InjectiveMonomial<7> for PackedGoldilocksAVX2 {} - -impl PermutationMonomial<7> for PackedGoldilocksAVX2 { - /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}. - /// - /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`. - fn injective_exp_root_n(&self) -> Self { - exp_10540996611094048183(*self) - } -} - -impl_add_base_field!(PackedGoldilocksAVX2, Goldilocks); -impl_sub_base_field!(PackedGoldilocksAVX2, Goldilocks); -impl_mul_base_field!(PackedGoldilocksAVX2, Goldilocks); -impl_div_methods!(PackedGoldilocksAVX2, Goldilocks); -impl_sum_prod_base_field!(PackedGoldilocksAVX2, Goldilocks); - -impl Algebra for PackedGoldilocksAVX2 { - // Benchmarked on AVX2: chunk=32 ≈ 226ns, chunk=2 ≈ 228ns, chunk=16 ≈ 229ns. - const BATCHED_LC_CHUNK: usize = 32; -} - -impl_packed_value!(PackedGoldilocksAVX2, Goldilocks, WIDTH); - -unsafe impl PackedField for PackedGoldilocksAVX2 { - type Scalar = Goldilocks; -} - -impl_packed_field_pow_2!( - PackedGoldilocksAVX2; - [ - (1, interleave_u64), - (2, interleave_u128), - ], - WIDTH -); - -// Resources: -// 1. Intel Intrinsics Guide for explanation of each intrinsic: -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/ -// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html -// 3. Intel optimization manual for introduction to x86 vector extensions and best practices: -// https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html - -// Preliminary knowledge: -// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with -// _mm256_blendv_epi8 or similar instruction. If all we're doing is conditionally zeroing a -// vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper. -// -// 2. 
AVX does not support addition with carry but 128-bit (2-word) addition can be easily -// emulated. The method recognizes that for a + b overflowed iff (a + b) < a: -// i. res_lo = a_lo + b_lo -// ii. carry_mask = res_lo < a_lo -// iii. res_hi = a_hi + b_hi - carry_mask -// Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions -// return -1 (all bits 1) for true and 0 for false. -// -// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons -// by recognizing that a __m256i { - unsafe { _mm256_xor_si256(x, SIGN_BIT) } -} - -/// Convert to canonical representation. -/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + 1<<63, where x is the field -/// value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1<<63), -/// where 0 <= y < FIELD_ORDER). -#[inline] -unsafe fn canonicalize_s(x_s: __m256i) -> __m256i { - unsafe { - // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1. - let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s); - // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0. - let wrapback_amt = _mm256_andnot_si256(mask, EPSILON); - _mm256_add_epi64(x_s, wrapback_amt) - } -} - -/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is -/// pre-shifted by 1 << 63. The result is similarly shifted. -#[inline] -unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i { - unsafe { - let res_wrapped_s = _mm256_add_epi64(x, y_s); - let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0. - let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. - _mm256_add_epi64(res_wrapped_s, wrapback_amt) - } -} - -/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER`. -/// -/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. 
-#[inline] -fn add(x: __m256i, y: __m256i) -> __m256i { - unsafe { - let y_s = shift(y); - let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); - shift(res_s) - } -} - -/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER`. -/// -/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn sub(x: __m256i, y: __m256i) -> __m256i { - unsafe { - let mut y_s = shift(y); - y_s = canonicalize_s(y_s); - let x_s = shift(x); - let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0. - let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0. - let res_wrapped = _mm256_sub_epi64(x_s, y_s); - _mm256_sub_epi64(res_wrapped, wrapback_amt) - } -} - -/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER`. -/// -/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn neg(y: __m256i) -> __m256i { - unsafe { - let y_s = shift(y); - _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) - } -} - -/// Halve a vector of Goldilocks field elements. -#[inline(always)] -pub(crate) fn halve(input: __m256i) -> __m256i { - /* - We want this to compile to: - vpand least_bit, val, ONE - vpsrlq t, val, 1 - vpsubq neg_least_bit, ZERO, least_bit - vpand maybe_half, HALF, neg_least_bit - vpaddq res, t, maybe_half - throughput: 1.67 cyc/vec - latency: 4 cyc - - Given an element val in [0, P), we want to compute val/2 mod P. - If val is even: val/2 mod P = val/2 = val >> 1. - If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2 - */ - unsafe { - // Safety: If this code got compiled then AVX2 intrinsics are available. - const ONE: __m256i = unsafe { transmute([1_i64; 4]) }; - const ZERO: __m256i = unsafe { transmute([0_i64; 4]) }; - let half = _mm256_set1_epi64x(P.div_ceil(2) as i64); // Compiler should realise this is constant. - - let least_bit = _mm256_and_si256(input, ONE); // Determine the parity of val. 
- let t = _mm256_srli_epi64::<1>(input); - - // Negate the least bit giving us either 0 (all bits 0) or -1 (all bits 1). - // It would be better to use vpsignq but this instruction does not exist. - let neg_least_bit = _mm256_sub_epi64(ZERO, least_bit); - - // This does nothing when least_bit = 1 and sets the corresponding entry to 0 when least_bit = 0 - let maybe_half = _mm256_and_si256(half, neg_least_bit); - _mm256_add_epi64(t, maybe_half) - } -} - -/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the -/// scalar instruction, but may be worth it if we want our data to live in vector registers. -#[inline] -fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) { - unsafe { - // We want to move the high 32 bits to the low position. The multiplication instruction ignores - // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can - // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication. - // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the - // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. - // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency - // since Haswell. - let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); - let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y))); - - // All four pairwise multiplications - let mul_ll = _mm256_mul_epu32(x, y); - let mul_lh = _mm256_mul_epu32(x, y_hi); - let mul_hl = _mm256_mul_epu32(x_hi, y); - let mul_hh = _mm256_mul_epu32(x_hi, y_hi); - - // Bignum addition - // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. - let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll); - let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi); - // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. 
- // Also, extract high 32 bits of t0 and add to mul_hh. - let t0_lo = _mm256_and_si256(t0, EPSILON); - let t0_hi = _mm256_srli_epi64::<32>(t0); - let t1 = _mm256_add_epi64(mul_lh, t0_lo); - let t2 = _mm256_add_epi64(mul_hh, t0_hi); - // Lastly, extract the high 32 bits of t1 and add to t2. - let t1_hi = _mm256_srli_epi64::<32>(t1); - let res_hi = _mm256_add_epi64(t2, t1_hi); - - // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high - // position). - let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1))); - let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo); - - (res_hi, res_lo) - } -} - -/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction. -#[inline] -fn square64(x: __m256i) -> (__m256i, __m256i) { - unsafe { - // Get high 32 bits of x. See comment in mul64_64_s. - let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); - - // All pairwise multiplications. - let mul_ll = _mm256_mul_epu32(x, x); - let mul_lh = _mm256_mul_epu32(x, x_hi); - let mul_hh = _mm256_mul_epu32(x_hi, x_hi); - - // Bignum addition, but mul_lh is shifted by 33 bits (not 32). - let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll); - let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi); - let t0_hi = _mm256_srli_epi64::<31>(t0); - let res_hi = _mm256_add_epi64(mul_hh, t0_hi); - - // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high - // position). - let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh); - let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo); - - (res_hi, res_lo) - } -} - -/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be -/// `<= 2^64 - 2^32 = 0xffffffff00000000`. The result is shifted by 2**63. -#[inline] -unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { - unsafe { - let res_wrapped_s = _mm256_add_epi64(x_s, y); - // 32-bit compare is faster than 64-bit. 
It's safe as long as x > res_wrapped iff x >> 32 > - // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case - // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 = - // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y - // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur. - let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0. - // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise. - let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. - _mm256_add_epi64(res_wrapped_s, wrapback_amt) - } -} - -/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be -/// <= `0xffffffff00000000`. The result is shifted by 2**63. -#[inline] -unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { - unsafe { - let res_wrapped_s = _mm256_sub_epi64(x_s, y); - // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >> - // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where - // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 = - // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if - // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur. - let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0. - // The mask contains 0xffffffff in the high 32 bits if wraparound occurred and 0 otherwise. - let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0. - _mm256_sub_epi64(res_wrapped_s, wrapback_amt) - } -} - -/// Given a 128-bit value represented as two 64-bit halves, reduce it modulo the Goldilocks field order. 
-/// -/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`. -#[inline] -fn reduce128(x: (__m256i, __m256i)) -> __m256i { - unsafe { - let (hi0, lo0) = x; - - // First we shift lo0 to lo0_s = lo0 + 2^{63} mod 2^64 - // This lets us emulate unsigned comparisons - let lo0_s = shift(lo0); - - // Get the top 32 bits of hi_hi0. - let hi_hi0 = _mm256_srli_epi64::<32>(hi0); - - // Computes lo0_s - hi_hi0 mod FIELD_ORDER. - // Makes sense to do as 2^96 = -1 mod FIELD_ORDER. - // sub_small_64s_64_s is safe to use as `hi_hi0 < 2^32`. - let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0); - - // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER - // _mm256_mul_epu32 ignores the top 32 bits so just use that. - let t1 = _mm256_mul_epu32(hi0, EPSILON); - - // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 so we can use `add_small_64s_64_s` to get - // `lo2_s = lo1_s + t1 mod FIELD_ORDER.` - let lo2_s = add_small_64s_64_s(lo1_s, t1); - - // Finally just need to correct for the shift. - shift(lo2_s) - } -} - -/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER`. -/// -/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn mul(x: __m256i, y: __m256i) -> __m256i { - reduce128(mul64_64(x, y)) -} - -/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER`. -/// -/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. 
-#[inline] -fn square(x: __m256i) -> __m256i { - reduce128(square64(x)) -} - -#[cfg(test)] -mod tests { - use p3_field_testing::test_packed_field; - - use super::{Goldilocks, PackedGoldilocksAVX2, WIDTH}; - - const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([ - 0xFFFF_FFFF_0000_0000, - 0xFFFF_FFFF_FFFF_FFFF, - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0001, - ]); - - const ZEROS: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([ - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, - ])); - - const ONES: PackedGoldilocksAVX2 = PackedGoldilocksAVX2(Goldilocks::new_array([ - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, - ])); - - test_packed_field!( - crate::PackedGoldilocksAVX2, - &[super::ZEROS], - &[super::ONES], - crate::PackedGoldilocksAVX2(super::SPECIAL_VALS) - ); -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs deleted file mode 100644 index f4d6c9f71..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mds.rs +++ /dev/null @@ -1,86 +0,0 @@ -use p3_mds::MdsPermutation; -use p3_mds::util::apply_circulant; -use p3_symmetric::Permutation; - -use crate::x86_64_avx512::packing::PackedGoldilocksAVX512; -use crate::{ - MATRIX_CIRC_MDS_8_SML_ROW, MATRIX_CIRC_MDS_12_SML_ROW, MATRIX_CIRC_MDS_16_SML_ROW, - MATRIX_CIRC_MDS_24_GOLDILOCKS, MdsMatrixGoldilocks, -}; -const fn convert_array(arr: [i64; N]) -> [u64; N] { - let mut result: [u64; N] = [0; N]; - let mut i = 0; - while i < N { - result[i] = arr[i] as u64; - i += 1; - } - result -} - -impl Permutation<[PackedGoldilocksAVX512; 8]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX512; 8]) -> [PackedGoldilocksAVX512; 8] { - const MATRIX_CIRC_MDS_8_SML_ROW_U64: [u64; 8] = convert_array(MATRIX_CIRC_MDS_8_SML_ROW); - 
apply_circulant(&MATRIX_CIRC_MDS_8_SML_ROW_U64, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksAVX512; 12]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX512; 12]) -> [PackedGoldilocksAVX512; 12] { - const MATRIX_CIRC_MDS_12_SML_ROW_U64: [u64; 12] = convert_array(MATRIX_CIRC_MDS_12_SML_ROW); - apply_circulant(&MATRIX_CIRC_MDS_12_SML_ROW_U64, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksAVX512; 16]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX512; 16]) -> [PackedGoldilocksAVX512; 16] { - const MATRIX_CIRC_MDS_16_SML_ROW_U64: [u64; 16] = convert_array(MATRIX_CIRC_MDS_16_SML_ROW); - apply_circulant(&MATRIX_CIRC_MDS_16_SML_ROW_U64, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -impl Permutation<[PackedGoldilocksAVX512; 24]> for MdsMatrixGoldilocks { - fn permute(&self, input: [PackedGoldilocksAVX512; 24]) -> [PackedGoldilocksAVX512; 24] { - apply_circulant(&MATRIX_CIRC_MDS_24_GOLDILOCKS, &input) - } -} - -impl MdsPermutation for MdsMatrixGoldilocks {} - -#[cfg(test)] -mod tests { - use p3_symmetric::Permutation; - use rand::rngs::SmallRng; - use rand::{RngExt, SeedableRng}; - - use crate::{Goldilocks, MdsMatrixGoldilocks, PackedGoldilocksAVX512}; - - macro_rules! 
test_avx512_mds { - ($name:ident, $width:literal) => { - #[test] - fn $name() { - let mut rng = SmallRng::seed_from_u64(1); - let mds = MdsMatrixGoldilocks; - - let input: [Goldilocks; $width] = rng.random(); - let expected = mds.permute(input); - - let packed_input = input.map(Into::::into); - let packed_output = mds.permute(packed_input); - - let avx512_output = packed_output.map(|x| x.0[0]); - assert_eq!(avx512_output, expected); - } - }; - } - - test_avx512_mds!(test_avx512_mds_width_8, 8); - test_avx512_mds!(test_avx512_mds_width_12, 12); - test_avx512_mds!(test_avx512_mds_width_16, 16); - test_avx512_mds!(test_avx512_mds_width_24, 24); -} diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs deleted file mode 100644 index 09300a20f..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod mds; -mod packing; -pub use packing::*; diff --git a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs b/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs deleted file mode 100644 index 0c751b436..000000000 --- a/bench_vs_plonky3/p3-goldilocks-patched/src/x86_64_avx512/packing.rs +++ /dev/null @@ -1,444 +0,0 @@ -use alloc::vec::Vec; -use core::arch::x86_64::*; -use core::fmt::Debug; -use core::iter::{Product, Sum}; -use core::mem::transmute; -use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; - -use p3_field::exponentiation::exp_10540996611094048183; -use p3_field::interleave::{interleave_u64, interleave_u128, interleave_u256}; -use p3_field::op_assign_macros::{ - impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, - impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, - ring_sum, -}; -use p3_field::{ - Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, 
- PermutationMonomial, PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2, -}; -use p3_util::reconstitute_from_base; -use rand::distr::{Distribution, StandardUniform}; -use rand::{Rng, RngExt}; - -use crate::{Goldilocks, P}; - -const WIDTH: usize = 8; - -/// Vectorized AVX512 implementation of `Goldilocks` arithmetic. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] -#[repr(transparent)] // Needed to make `transmute`s safe. -#[must_use] -pub struct PackedGoldilocksAVX512(pub [Goldilocks; WIDTH]); - -impl PackedGoldilocksAVX512 { - /// Get an arch-specific vector representing the packed values. - #[inline] - #[must_use] - pub(crate) fn to_vector(self) -> __m512i { - unsafe { - // Safety: `Goldilocks` is `repr(transparent)` so it can be transmuted to `u64`. It - // follows that `[Goldilocks; WIDTH]` can be transmuted to `[u64; WIDTH]`, which can be - // transmuted to `__m512i`, since arrays are guaranteed to be contiguous in memory. - // Finally `PackedGoldilocksAVX512` is `repr(transparent)` so it can be transmuted to - // `[Goldilocks; WIDTH]`. - transmute(self) - } - } - - /// Make a packed field vector from an arch-specific vector. - /// - /// Elements of `Goldilocks` are allowed to be arbitrary u64s so this function - /// is safe unlike the `Mersenne31/MontyField31` variants. - #[inline] - pub(crate) fn from_vector(vector: __m512i) -> Self { - unsafe { - // Safety: `__m512i` can be transmuted to `[u64; WIDTH]` (since arrays elements are - // contiguous in memory), which can be transmuted to `[Goldilocks; WIDTH]` (since - // `Goldilocks` is `repr(transparent)`), which in turn can be transmuted to - // `PackedGoldilocksAVX512` (since `PackedGoldilocksAVX512` is also `repr(transparent)`). - transmute(vector) - } - } - - /// Copy `value` to all positions in a packed vector. This is the same as - /// `From::from`, but `const`. 
- #[inline] - const fn broadcast(value: Goldilocks) -> Self { - Self([value; WIDTH]) - } -} - -impl From for PackedGoldilocksAVX512 { - fn from(x: Goldilocks) -> Self { - Self::broadcast(x) - } -} - -impl Add for PackedGoldilocksAVX512 { - type Output = Self; - #[inline] - fn add(self, rhs: Self) -> Self { - Self::from_vector(add(self.to_vector(), rhs.to_vector())) - } -} - -impl Sub for PackedGoldilocksAVX512 { - type Output = Self; - #[inline] - fn sub(self, rhs: Self) -> Self { - Self::from_vector(sub(self.to_vector(), rhs.to_vector())) - } -} - -impl Neg for PackedGoldilocksAVX512 { - type Output = Self; - #[inline] - fn neg(self) -> Self { - Self::from_vector(neg(self.to_vector())) - } -} - -impl Mul for PackedGoldilocksAVX512 { - type Output = Self; - #[inline] - fn mul(self, rhs: Self) -> Self { - Self::from_vector(mul(self.to_vector(), rhs.to_vector())) - } -} - -impl_add_assign!(PackedGoldilocksAVX512); -impl_sub_assign!(PackedGoldilocksAVX512); -impl_mul_methods!(PackedGoldilocksAVX512); -ring_sum!(PackedGoldilocksAVX512); -impl_rng!(PackedGoldilocksAVX512); - -impl PrimeCharacteristicRing for PackedGoldilocksAVX512 { - type PrimeSubfield = Goldilocks; - - const ZERO: Self = Self::broadcast(Goldilocks::ZERO); - const ONE: Self = Self::broadcast(Goldilocks::ONE); - const TWO: Self = Self::broadcast(Goldilocks::TWO); - const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE); - - #[inline] - fn from_prime_subfield(f: Self::PrimeSubfield) -> Self { - f.into() - } - - #[inline] - fn halve(&self) -> Self { - Self::from_vector(halve(self.to_vector())) - } - - #[inline] - fn square(&self) -> Self { - Self::from_vector(square(self.to_vector())) - } - - #[inline] - fn zero_vec(len: usize) -> Vec { - // SAFETY: this is a repr(transparent) wrapper around an array. 
- unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) } - } -} - -impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks); -impl_sub_base_field!(PackedGoldilocksAVX512, Goldilocks); -impl_mul_base_field!(PackedGoldilocksAVX512, Goldilocks); -impl_div_methods!(PackedGoldilocksAVX512, Goldilocks); -impl_sum_prod_base_field!(PackedGoldilocksAVX512, Goldilocks); - -impl Algebra for PackedGoldilocksAVX512 { - // Benchmarked on AVX-512: chunk=4 ≈ 198ns, chunk=2 ≈ 198ns, chunk=32 ≈ 199ns. - const BATCHED_LC_CHUNK: usize = 4; -} - -// Degree of the smallest permutation polynomial for Goldilocks. -// -// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7. -impl InjectiveMonomial<7> for PackedGoldilocksAVX512 {} - -impl PermutationMonomial<7> for PackedGoldilocksAVX512 { - /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}. - /// - /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`. - fn injective_exp_root_n(&self) -> Self { - exp_10540996611094048183(*self) - } -} - -impl_packed_value!(PackedGoldilocksAVX512, Goldilocks, WIDTH); - -unsafe impl PackedField for PackedGoldilocksAVX512 { - type Scalar = Goldilocks; -} - -impl_packed_field_pow_2!( - PackedGoldilocksAVX512; - [ - (1, interleave_u64), - (2, interleave_u128), - (4, interleave_u256), - ], - WIDTH -); - -const FIELD_ORDER: __m512i = unsafe { transmute([Goldilocks::ORDER_U64; WIDTH]) }; -const EPSILON: __m512i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) }; - -#[inline] -unsafe fn canonicalize(x: __m512i) -> __m512i { - unsafe { - let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER); - _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER) - } -} - -/// Compute the modular addition `x + y mod FIELD_ORDER`. -/// -/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider -/// set of circumstances if bounds on `x` are known. 
-/// -/// The result will be a u64 which may be greater than FIELD_ORDER. -/// -/// Safety: -/// User must ensure that x + y < 2^64 + FIELD_ORDER. -#[inline] -unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { - unsafe { - let res_wrapped = _mm512_add_epi64(x, y); - let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed - _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER) - } -} - -/// Compute the modular subtraction x - y mod FIELD_ORDER. -/// -/// This function is always safe if `y < FIELD_ORDER` but may also be used in a wider -/// set of circumstances if bounds on `x` are known. -/// -/// The result will be a u64 which may be greater than FIELD_ORDER. -/// -/// Safety: -/// User must ensure that x - y > -FIELD_ORDER. -#[inline] -unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { - unsafe { - let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y) - let res_wrapped = _mm512_sub_epi64(x, y); - _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER) - } -} - -/// Goldilocks modular addition. Computes `x + y mod FIELD_ORDER`. -/// -/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn add(x: __m512i, y: __m512i) -> __m512i { - unsafe { add_no_double_overflow_64_64(x, canonicalize(y)) } -} - -/// Goldilocks modular subtraction. Computes `x - y mod FIELD_ORDER`. -/// -/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn sub(x: __m512i, y: __m512i) -> __m512i { - unsafe { sub_no_double_overflow_64_64(x, canonicalize(y)) } -} - -/// Goldilocks modular negation. Computes `-x mod FIELD_ORDER`. -/// -/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn neg(y: __m512i) -> __m512i { - unsafe { _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) } -} - -/// Halve a vector of Goldilocks field elements. 
-#[inline(always)] -pub(crate) fn halve(input: __m512i) -> __m512i { - /* - We want this to compile to: - vptestmq least_bit, val, ONE - vpsrlq res, val, 1 - vpaddq res{least_bit}, res, maybe_half - throughput: 2 cyc/vec - latency: 4 cyc - - Given an element val in [0, P), we want to compute val/2 mod P. - If val is even: val/2 mod P = val/2 = val >> 1. - If val is odd: val/2 mod P = (val + P)/2 = (val >> 1) + (P + 1)/2 - */ - unsafe { - // Safety: If this code got compiled then AVX512 intrinsics are available. - const ONE: __m512i = unsafe { transmute([1_i64; 8]) }; - let half = _mm512_set1_epi64(P.div_ceil(2) as i64); // Compiler realises this is constant. - - let least_bit = _mm512_test_epi64_mask(input, ONE); // Determine the parity of val. - let t = _mm512_srli_epi64::<1>(input); - // This does nothing when least_bit = 1 and sets the corresponding entry to 0 when least_bit = 0 - _mm512_mask_add_epi64(t, least_bit, t, half) - } -} - -#[allow(clippy::useless_transmute)] -const LO_32_BITS_MASK: __mmask16 = unsafe { transmute(0b0101010101010101u16) }; - -/// Full 64-bit by 64-bit multiplication. -#[inline] -fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) { - unsafe { - // We want to move the high 32 bits to the low position. The multiplication instruction ignores - // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can - // be done on port 5; bitshifts run on port 0, competing with multiplication. - // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the - // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. - // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency - // since Haswell. 
- let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); - let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y))); - - // All four pairwise multiplications - let mul_ll = _mm512_mul_epu32(x, y); - let mul_lh = _mm512_mul_epu32(x, y_hi); - let mul_hl = _mm512_mul_epu32(x_hi, y); - let mul_hh = _mm512_mul_epu32(x_hi, y_hi); - - // Bignum addition - // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. - let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll); - let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi); - // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. - // Also, extract high 32 bits of t0 and add to mul_hh. - let t0_lo = _mm512_and_si512(t0, EPSILON); - let t0_hi = _mm512_srli_epi64::<32>(t0); - let t1 = _mm512_add_epi64(mul_lh, t0_lo); - let t2 = _mm512_add_epi64(mul_hh, t0_hi); - // Lastly, extract the high 32 bits of t1 and add to t2. - let t1_hi = _mm512_srli_epi64::<32>(t1); - let res_hi = _mm512_add_epi64(t2, t1_hi); - - // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high - // position). - let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1))); - let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); - - (res_hi, res_lo) - } -} - -/// Full 64-bit squaring. -#[inline] -fn square64(x: __m512i) -> (__m512i, __m512i) { - unsafe { - // Get high 32 bits of x. See comment in mul64_64_s. - let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); - - // All pairwise multiplications. - let mul_ll = _mm512_mul_epu32(x, x); - let mul_lh = _mm512_mul_epu32(x, x_hi); - let mul_hh = _mm512_mul_epu32(x_hi, x_hi); - - // Bignum addition, but mul_lh is shifted by 33 bits (not 32). 
- let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll); - let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi); - let t0_hi = _mm512_srli_epi64::<31>(t0); - let res_hi = _mm512_add_epi64(mul_hh, t0_hi); - - // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high - // position). - let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh); - let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo); - - (res_hi, res_lo) - } -} - -/// Given a 128-bit value represented as two 64-bit halves, reduce it modulo the Goldilocks field order. -/// -/// The result will be a 64-bit value but may be larger than `FIELD_ORDER`. -#[inline] -fn reduce128(x: (__m512i, __m512i)) -> __m512i { - unsafe { - let (hi0, lo0) = x; - - // Find the high 32 bits of hi0. - let hi_hi0 = _mm512_srli_epi64::<32>(hi0); - - // Computes lo0_s - hi_hi0 mod FIELD_ORDER. - // Makes sense to do as 2^96 = -1 mod FIELD_ORDER. - // `sub_no_double_overflow_64_64` is safe to use as `hi_hi0 < 2^32`. - let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0); - - // Compute the product of the bottom 32 bits of hi0 with 2^64 = 2^32 - 1 mod FIELD_ORDER - // _mm256_mul_epu32 ignores the top 32 bits so just use that. - let t1 = _mm512_mul_epu32(hi0, EPSILON); - - // Clearly t1 <= (2^32 - 1)^2 = 2^64 - 2^33 + 1 < FIELD_ORDER so we can use `add_no_double_overflow_64_64` to get - // `lo1 + t1 mod FIELD_ORDER.` - add_no_double_overflow_64_64(lo1, t1) - } -} - -/// Goldilocks modular multiplication. Computes `x * y mod FIELD_ORDER`. -/// -/// Inputs can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. -#[inline] -fn mul(x: __m512i, y: __m512i) -> __m512i { - reduce128(mul64_64(x, y)) -} - -/// Goldilocks modular square. Computes `x^2 mod FIELD_ORDER`. -/// -/// Input can be arbitrary, output is not guaranteed to be less than `FIELD_ORDER`. 
-#[inline] -fn square(x: __m512i) -> __m512i { - reduce128(square64(x)) -} - -#[cfg(test)] -mod tests { - use p3_field_testing::test_packed_field; - - use super::{Goldilocks, PackedGoldilocksAVX512, WIDTH}; - - const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([ - 0xFFFF_FFFF_0000_0001, - 0xFFFF_FFFF_0000_0000, - 0xFFFF_FFFE_FFFF_FFFF, - 0xFFFF_FFFF_FFFF_FFFF, - 0x0000_0000_0000_0000, - 0x0000_0000_0000_0001, - 0x0000_0000_0000_0002, - 0x0FFF_FFFF_F000_0000, - ]); - - const ZEROS: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([ - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, - 0x0000_0000_0000_0000, - 0xFFFF_FFFF_0000_0001, - ])); - - const ONES: PackedGoldilocksAVX512 = PackedGoldilocksAVX512(Goldilocks::new_array([ - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, - 0x0000_0000_0000_0001, - 0xFFFF_FFFF_0000_0002, - ])); - - test_packed_field!( - crate::PackedGoldilocksAVX512, - &[super::ZEROS], - &[super::ONES], - crate::PackedGoldilocksAVX512(super::SPECIAL_VALS) - ); -} diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index a0ace698d..a23f0144d 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -5,15 +5,14 @@ # Usage: # ./bench_vs_plonky3/run.sh [--log-rows K ...] [--num-sequences N] [--runs N] # [--lambda-only | --p3-only] [--report-dir DIR] -# [--no-p3-patch] [--scalar] [--no-color] +# [--scalar] [--no-color] # # Defaults: --log-rows 19, --num-sequences 16, --runs 3. # With multiple --log-rows values, prints one median row per size. # -# --scalar: disables SIMD at the target-feature level. On x86_64 drops AVX2 -# and AVX-512 (Goldilocks + most of Keccak go scalar, residual SSE2 in -# p3-keccak). On aarch64 drops the SHA3 NEON extension. 
Triggers a rebuild -# when toggling; subsequent runs with the same RUSTFLAGS are cached. +# --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks (and most of Keccak) +# run scalar; residual SSE2 in p3-keccak remains. Triggers a rebuild when +# toggling; subsequent runs with the same RUSTFLAGS are cached. set -euo pipefail @@ -22,7 +21,6 @@ ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_DIR="/tmp/bench_p3" REPORT_DIR="" NO_COLOR=false -NO_P3_PATCH=false SCALAR=false RED='\033[0;31m' @@ -70,10 +68,6 @@ while [[ $# -gt 0 ]]; do REPORT_DIR=$2 shift 2 ;; - --no-p3-patch) - NO_P3_PATCH=true - shift - ;; --scalar) SCALAR=true shift @@ -122,78 +116,23 @@ if [ -n "$REPORT_DIR" ]; then mkdir -p "$REPORT_DIR/raw" fi -# --- Patch toggle ----------------------------------------------------------- -# The root Cargo.toml has a [patch.crates-io] block pointing at the vendored -# p3-goldilocks-patched (adds BinomiallyExtendable<3>, disables NEON). For the -# nightly we build against vanilla crates.io p3-goldilocks — we comment the -# block out and drop the `p3-degree3` feature. -# -# Both Cargo.toml AND Cargo.lock are backed up before the build: dropping the -# patch makes cargo re-resolve p3-goldilocks against crates.io, which rewrites -# Cargo.lock. The trap restores both so the working tree is clean on exit. -CARGO_TOML="$ROOT_DIR/Cargo.toml" -CARGO_LOCK="$ROOT_DIR/Cargo.lock" -CARGO_TOML_BAK="" -CARGO_LOCK_BAK="" -BUILD_FEATURE_FLAGS=() -if $NO_P3_PATCH; then - CARGO_TOML_BAK="$CARGO_TOML.bak.p3bench.$$" - cp "$CARGO_TOML" "$CARGO_TOML_BAK" - if [ -f "$CARGO_LOCK" ]; then - CARGO_LOCK_BAK="$CARGO_LOCK.bak.p3bench.$$" - cp "$CARGO_LOCK" "$CARGO_LOCK_BAK" - fi - # Comment the [patch.crates-io] block and its entries (until the next blank - # line or next [section]). 
- python3 - "$CARGO_TOML" <<'PY' -import sys, pathlib -path = pathlib.Path(sys.argv[1]) -lines = path.read_text().splitlines(keepends=True) -out = [] -in_patch = False -for ln in lines: - stripped = ln.strip() - if stripped == "[patch.crates-io]": - in_patch = True - out.append("# " + ln if not ln.startswith("#") else ln) - continue - if in_patch: - if stripped.startswith("[") and stripped.endswith("]"): - in_patch = False - out.append(ln) - continue - if stripped == "": - in_patch = False - out.append(ln) - continue - out.append("# " + ln if not ln.startswith("#") else ln) - else: - out.append(ln) -path.write_text("".join(out)) -PY - trap 'if [ -n "$CARGO_TOML_BAK" ] && [ -f "$CARGO_TOML_BAK" ]; then mv "$CARGO_TOML_BAK" "$CARGO_TOML"; fi; if [ -n "$CARGO_LOCK_BAK" ] && [ -f "$CARGO_LOCK_BAK" ]; then mv "$CARGO_LOCK_BAK" "$CARGO_LOCK"; fi' EXIT INT TERM - BUILD_FEATURE_FLAGS=(--no-default-features --features parallel) -fi - # --- Scalar (no SIMD) toggle ------------------------------------------------ -# When --scalar is on, disable vector instruction sets for the build so both -# provers run against the same scalar baseline. p3-keccak keeps SSE2 residual -# on x86 — acceptable per the bench workstream (contribution is ~7%). -# x86_64 → -avx2,-avx512f (Goldilocks + most of Keccak go scalar) -# aarch64 → -sha3 (drops Keccak NEON SHA3 extension) +# When --scalar is on, disable AVX2/AVX-512 so Goldilocks (and most of Keccak) +# run scalar for an apples-to-apples comparison against Lambda STARK. The +# residual SSE2 path on p3-keccak is intentionally left enabled — its +# contribution to total prove time is ~7%. # Cargo caches per-RUSTFLAGS, so toggling scalar vs vector triggers a rebuild # on first use but is cached afterwards. 
SCALAR_RUSTFLAGS="" +SCALAR_ACTIVE=false if $SCALAR; then case "$(uname -m)" in x86_64|amd64) SCALAR_RUSTFLAGS="-C target-feature=-avx2,-avx512f" - ;; - arm64|aarch64) - SCALAR_RUSTFLAGS="-C target-feature=-sha3" + SCALAR_ACTIVE=true ;; *) - echo "warning: --scalar: unknown arch $(uname -m); not pinning RUSTFLAGS" >&2 + echo "warning: --scalar: only supported on x86_64; host is $(uname -m), not pinning RUSTFLAGS" >&2 ;; esac if [ -n "$SCALAR_RUSTFLAGS" ]; then @@ -210,24 +149,19 @@ echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}" echo -e " log-rows: ${YELLOW}${LOG_ROWS[*]}${NC}" echo -e " num-sequences: ${YELLOW}${NUM_SEQUENCES}${NC} (columns = $((2 * NUM_SEQUENCES)))" echo -e " runs/size: ${YELLOW}${RUNS}${NC} (median reported)" -if $NO_P3_PATCH; then - echo -e " p3 extension: ${YELLOW}degree 2 (vanilla, no patch)${NC}" -else - echo -e " p3 extension: ${YELLOW}degree 3 (patched, matches Lambda)${NC}" -fi -if $SCALAR; then +echo -e " p3 extension: ${YELLOW}degree 3 (forked p3-goldilocks, matches Lambda)${NC}" +if $SCALAR_ACTIVE; then echo -e " scalar mode: ${YELLOW}on${NC} (arch=$(uname -m), RUSTFLAGS=\"${RUSTFLAGS:-}\")" +elif $SCALAR; then + echo -e " scalar mode: ${YELLOW}requested (unsupported on $(uname -m))${NC} (SIMD enabled, compiler default)" else echo -e " scalar mode: ${YELLOW}off${NC} (SIMD enabled, compiler default)" fi echo "" echo -e "${GREEN}[build]${NC} prove_bench" -# Use the `${arr[@]+...}` expansion so `set -u` doesn't blow up when the -# feature-flag array is empty (bash 3 on macOS). cargo build --release -p bench-vs-plonky3 --bin prove_bench \ - --manifest-path "$ROOT_DIR/Cargo.toml" \ - ${BUILD_FEATURE_FLAGS[@]+"${BUILD_FEATURE_FLAGS[@]}"} 2>&1 | tail -5 + --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -5 # Resolve the actual target directory via cargo metadata so we find the binary # whether cargo used ./target/ (default) or a custom CARGO_TARGET_DIR. 
@@ -378,10 +312,6 @@ echo "" if $RUN_LAMBDA && $RUN_P3; then echo -e "Timing window: single-shot end-to-end prove." fi -if $NO_P3_PATCH; then - echo -e "${YELLOW}Note:${NC} Plonky3 was built without the degree-3 patch; Challenge type is degree-2." - echo -e " Lambda keeps degree-3 — extension fields differ across sides." -fi # --- Machine-readable report ------------------------------------------------ @@ -428,10 +358,14 @@ if [ -n "$REPORT_DIR" ]; then echo "fri_queries=219" echo "grinding=0" echo "runs_per_size=$RUNS" - echo "p3_extension=$($NO_P3_PATCH && echo 'degree2_vanilla' || echo 'degree3_patched')" - echo "scalar=$($SCALAR && echo on || echo off)" - if $SCALAR && [ -n "$SCALAR_RUSTFLAGS" ]; then + echo "p3_extension=degree3_fork" + if $SCALAR_ACTIVE; then + echo "scalar=on" echo "rustflags=$SCALAR_RUSTFLAGS" + elif $SCALAR; then + echo "scalar=requested_unsupported" + else + echo "scalar=off" fi echo "timing_window=single_shot_end_to_end_prove_no_verify" echo "log_rows_series=$(join_slash "${RESULT_LOG_ROWS[@]}")" diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs index b74f18ad2..cc57a3e5d 100644 --- a/bench_vs_plonky3/src/plonky3_config.rs +++ b/bench_vs_plonky3/src/plonky3_config.rs @@ -11,20 +11,11 @@ use p3_uni_stark::StarkConfig; pub type Val = Goldilocks; -/// Cubic extension (default, `p3-degree3` feature): matches Lambda's -/// `Degree3GoldilocksExtensionField`, irreducible x^3 - 2. Needs the vendored -/// `p3-goldilocks-patched` crate (enabled via root `[patch.crates-io]`). -#[cfg(feature = "p3-degree3")] +/// Cubic extension matching Lambda's `Degree3GoldilocksExtensionField` +/// (irreducible x^3 - 2). Provided by the forked `p3-goldilocks` via +/// `BinomiallyExtendable<3>`. pub type Challenge = BinomialExtensionField; -/// Quadratic extension (vanilla upstream p3-goldilocks 0.5.2). Compiled when -/// `p3-degree3` is disabled, typically together with commenting the root -/// `[patch.crates-io]` block. 
Lambda still runs degree 3, so this is NOT a -/// fair comparison on the extension field — it is used for nightly tracking -/// against the off-the-shelf P3 config. -#[cfg(not(feature = "p3-degree3"))] -pub type Challenge = BinomialExtensionField; - type ByteHash = Keccak256Hash; type U64Hash = PaddingFreeSponge; type FieldHash = SerializingHasher; @@ -85,7 +76,7 @@ pub fn plonky3_benchmark_config() -> P3Config { let dft = Dft::default(); let challenger = Challenger::from_hasher(vec![], byte_hash); - let fri_params = p3_fri::create_benchmark_fri_params(challenge_mmcs); + let fri_params = FriParameters::new_benchmark(challenge_mmcs); let pcs = Pcs::new(dft, val_mmcs, fri_params); P3Config::new(pcs, challenger) From abaa0a2f2d25bbc86e253bea609a8b38979dbcff Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 20 Apr 2026 11:42:55 -0300 Subject: [PATCH 23/34] use ssh for p3 fork --- bench_vs_plonky3/Cargo.toml | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index 5b313106f..239abf316 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -7,7 +7,10 @@ edition = "2024" # Lambda STARK stark = { path = "../crypto/stark", features = ["test-utils"] } crypto = { path = "../crypto/crypto", features = ["std", "serde"] } -math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] } +math = { path = "../crypto/math", features = [ + "std", + "lambdaworks-serde-binary", +] } # Plonky3: pinned to the yetanotherco fork, branch `feat/goldilocks_deg3`. # The branch adds BinomiallyExtendable<3> for Goldilocks (x^3 - 2), matching @@ -15,18 +18,22 @@ math = { path = "../crypto/math", features = ["std", "lambdaworks-serde-binary"] # the same git source + ref; declaring any of them as a crates.io dep would # pull in a second incompatible p3-field. 
cargo clones the fork once into # ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time. -p3-air = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-field = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-goldilocks = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-matrix = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-commit = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-challenger = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-symmetric = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-keccak = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-fri = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-uni-stark = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] } -p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = ["parallel"] } +p3-air = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-field = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-goldilocks = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-matrix = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-commit = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-challenger = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = 
"feat/goldilocks_deg3" } +p3-symmetric = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-merkle-tree = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-keccak = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-fri = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-uni-stark = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ + "parallel", +] } +p3-dft = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ + "parallel", +] } # Tracing for P3 span-based profiling tracing = "0.1" From d3d41b0f5ff282b974d4aa5434eb579ea90062b8 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 20 Apr 2026 16:30:34 -0300 Subject: [PATCH 24/34] rm old files --- bench_vs_plonky3/ANALYSIS_LOG.md | 432 ---------------------------- bench_vs_plonky3/INSTRUMENTATION.md | 189 ------------ bench_vs_plonky3/README.md | 20 +- 3 files changed, 4 insertions(+), 637 deletions(-) delete mode 100644 bench_vs_plonky3/ANALYSIS_LOG.md delete mode 100644 bench_vs_plonky3/INSTRUMENTATION.md diff --git a/bench_vs_plonky3/ANALYSIS_LOG.md b/bench_vs_plonky3/ANALYSIS_LOG.md deleted file mode 100644 index ab19e9a1f..000000000 --- a/bench_vs_plonky3/ANALYSIS_LOG.md +++ /dev/null @@ -1,432 +0,0 @@ -# Lambda STARK vs Plonky3 — Analysis Log - -## Session: 2026-04-14 to 2026-04-16 - ---- - -## 0. Final Server Baseline (2026-04-16) - -**Config:** blowup=2, 219 queries, grinding=0, ext degree 3 both, scalar (no AVX2), parallel (rayon both), identical AIR (32 cols × 2^18). 
- -**Command:** `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench -p bench-vs-plonky3` - -### Prove - -| Prover | Time | Throughput | -|--------|------|------------| -| Lambda | **1.213 s** | 6.92 Melem/s | -| Plonky3 | **479 ms** | 17.50 Melem/s | -| **Ratio** | **2.53×** | | - -### Verify - -| Prover | Time | -|--------|------| -| Lambda | **23.3 ms** | -| Plonky3 | **20.4 ms** | -| **Ratio** | **1.14×** | - -### Gap attribution (734ms = 1213 - 479) - -Extension field is MATCHED (both degree 3). The 2.53× is pure algorithm/implementation: - -| Cause | Est. savings | % of gap | Effort | -|-------|-------------|----------|--------| -| **Quotient domain eval** (2^18 vs 2^19 LDE) | ~220ms | 30% | Low | -| **Batched FFT** (coset_lde_batch vs per-column) | ~150ms | 20% | Medium | -| **Alpha decomposition + monomorphization** | ~100ms | 14% | Medium-High | -| **FRI folding parallel** | ~73ms | 10% | Very low | -| **Boundary selectors** (vs zerofier precompute) | ~45ms | 6% | Low | -| **Memory allocation patterns** | ~37ms | 5% | Low | -| **SSE2 Keccak residual** (~7% hash advantage) | ~50ms | 7% | N/A (can't fix) | -| Other (compilation, unrolling, tuning) | ~59ms | 8% | - | - -### Predicted instruments breakdown (blowup=2, 219q) - -| Phase | Predicted time | % | -|-------|---------------|---| -| FRI queries (R4) | 180ms | 15% ← NEW bottleneck (2.19× queries) | -| R2 constraint eval | 168ms | 14% | -| R4 deep comp poly | 131ms | 11% | -| R1 Main Merkle | 105ms | 9% | -| R4 FRI commit | 76ms | 6% | -| R1 reconstruct LDE | 71ms | 6% | -| R3 OOD eval | 71ms | 6% | -| R1 Main LDE | 65ms | 5% | -| R4 deep extend | 52ms | 4% | -| R2 comp Merkle | 13ms | 1% | -| Pre-pass | 11ms | 1% | - -### Optimization roadmap (ranked by impact/effort) - -| # | Optimization | Savings | Effort | Result | -|---|-------------|---------|--------|--------| -| 1 | Quotient domain (stride=blowup in evaluator) | ~80ms | 1h | 1.13s | -| 2 | Parallel FRI fold (par_iter) | ~40ms | 30min | 1.09s 
| -| 3 | Boundary selectors (replace zerofier precompute) | ~45ms | 2h | 1.05s | -| 4 | LogUp alpha precompute | ~10ms | 30min | 1.04s | -| 5 | Monomorphize constraints (enum dispatch) | ~35ms | 4h | 1.00s | -| 6 | Batched FFT (coset_lde_batch pattern) | ~150ms | 8h | 0.85s | -| 7 | Row-major trace storage | ~20ms | 8h | 0.83s | - -**With items 1-5 (~210ms, ~8h work):** Lambda ~1.0s vs Plonky3 0.48s = **2.08×** -**With items 1-7 (~380ms, ~24h work):** Lambda ~0.83s vs Plonky3 0.48s = **1.73×** -**Remaining gap** after all: ~350ms from SSE2 Keccak + deep comp + Plonky3 micro-optimizations - -### M1 instruments breakdown (with PR #492, blowup=2, ext3 both) - -**Command:** `RUSTFLAGS="-C target-feature=-sha3" cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture` - -| Fase | Lambda (1.068s) | % | Plonky3 (352ms) | % | Ratio | -|------|-----------------|---|-----------------|---|-------| -| Trace commit (LDE+Merkle) | 317ms (LDE 127 + Merkle 165) | 30% | 138ms (commit to trace data) | 39% | 2.3× | -| **Constraint eval** | **325ms** | **30%** | **50ms** (quotient_values) | **14%** | **6.5×** | -| Quotient commit | 53ms | 5% | 49ms | 14% | 1.1× | -| OOD eval | 62ms | 6% | ~10ms (Lagrange interp) | 3% | 6.2× | -| Deep comp poly | 173ms | 16% | (inside "open") | | | -| Deep extend | 36ms | 3% | | | | -| FRI commit (folding+Merkle) | 83ms | 8% | 47ms (commit phase) | 13% | 1.8× | -| FRI queries | 1ms | 0% | 2ms (query phase) | 1% | — | -| Open total | 293ms | 27% | 110ms | 31% | 2.7× | -| Pre-pass | 7ms | 1% | — | | | - ---- - -## Fairness Audit - -### AIR equivalence: VERIFIED - -Both AIRs prove the same mathematical statement: -- 32 cols × 2^18 rows, 2-row window -- Constraint 1: `next_left = local_left + local_right` -- Constraint 2: `next_right = local_right + next_left` -- Boundary: row 0 pins `(a_s, b_s) = (s+1, s+2)` per sequence -- Test `lambda_pair_trace_matches_plonky3_trace` verifies ALL cells (not subset) -- 
Mathematical trace for seq (1,2): (1,2)→(3,5)→(8,13)→(21,34) — identical both sides - -### Parameters: ALL MATCHED (except noted) - -| Parameter | Lambda | Plonky3 | Status | -|-----------|--------|---------|--------| -| Base field | Goldilocks | Goldilocks | ✅ | -| Extension | degree 3 (`x³−2`) | degree 3 (`x³−2`, vendored) | ✅ | -| Blowup | 2 | 2 (log_blowup=1) | ✅ | -| FRI queries | 219 | 219 | ✅ | -| Grinding | 0 | 0 | ✅ | -| Hash | Keccak-256 | Keccak-256 | ✅ | -| Rayon | ON | ON (p3-uni-stark/parallel + p3-dft/parallel) | ✅ | -| SIMD Goldilocks | OFF | OFF (NEON patched to `Self`) | ✅ | -| SIMD Keccak (x86) | scalar (sha3 crate) | SSE2 2-wide | ⚠️ residual | -| SIMD Keccak (M1 with -sha3) | scalar | scalar (fallback) | ✅ | - -### Platform fairness guide - -| Platform | Command | Keccak P3 | Goldilocks P3 | Fairness | -|----------|---------|-----------|---------------|----------| -| **M1 + `-sha3`** | `RUSTFLAGS="-C target-feature=-sha3" cargo bench ...` | Scalar | Scalar | **100% fair** | -| M1 no flags | `cargo bench ...` | NEON SHA3 HW | Scalar | P3 has Keccak HW | -| **x86 + `-avx2,-avx512f`** | `RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ...` | SSE2 2-wide | Scalar | ~93% fair | -| x86 no flags | `cargo bench ...` | AVX2 4-wide | AVX2 4-wide | P3 has full SIMD | - -**For fairest comparison: M1 with `-sha3`** — only platform where everything is scalar both sides. - -### Security model asymmetry (doesn't affect compute, affects interpretation) - -- **Lambda (Johnson Bound, proven):** 219 queries × 0.49 bits/query = **~108 bits** proven security -- **Plonky3 (ethSTARK conjecture):** 219 queries × 1.0 bit/query = **~219 bits** conjectured (cap 192 by field) -- Same 219 queries = same computational work. Different security interpretation. 
-- For "matched security" at 108 conjectured bits, P3 would need only ~108 queries (half the FRI work) - -### What's NOT unfairness (architectural differences = what we measure) - -These are implementation choices, not benchmark bias: -- Quotient domain eval (P3) vs full LDE eval (Lambda) → 6.5× constraint eval -- Monomorphization (P3) vs vtable dispatch (Lambda) → ~1.2× overhead -- Batched FFT (P3) vs per-column (Lambda) → ~2× trace commit -- Row-major (P3) vs column-major (Lambda) → cache efficiency -- Boundary selectors (P3) vs zerofier precompute (Lambda) → ~2× boundary cost - -### What IS potential unfairness - -1. SSE2 Keccak on x86 — P3 gets 2-wide Keccak, Lambda doesn't. ~7% of total. Unavoidable on x86. -2. Lambda samples NO extra LogUp/bus challenges for this AIR (verified: `has_aux_trace() = false` skips sampling). -3. Lambda wraps in `multi_prove` with vec of 1 — transcript clone overhead is negligible. - -**Conclusion: The benchmark is fair for comparing prover implementation efficiency.** - ---- - -## 1. 
Benchmark Setup - -### AIR (identical both sides) -- 16 Fibonacci sequences, 2 cols/sequence = **32 columns** -- **2^18 rows** (each row packs 2 Fibonacci steps → 2^19 effective steps) -- 2-row window: `next.left = local.left + local.right`, `next.right = local.right + next.left` -- 32 boundary constraints pinning initial values via public inputs -- Test `lambda_pair_trace_matches_plonky3_trace` verifies cell-by-cell equivalence - -### Matched parameters -- Base field: Goldilocks (p = 2^64 − 2^32 + 1) -- Blowup: 4 -- FRI queries: 100 -- Grinding: 0 -- Hash: Keccak-256 (scalar on both sides when `-C target-feature=-sha3`) - -### Unmatched (architectural) -- **Extension field:** Lambda degree 3 (`x^3 - 2`, 192-bit), Plonky3 degree 2 (`x^2 - 7`, 128-bit) - - Plonky3 0.5.2 has Goldilocks extensions for degree 2 and 5, but NOT degree 3 - - Lambda ext-mul: 9 base muls + 3 reduce128 - - Plonky3 ext-mul: 4 base muls + 2 adds -- **Prover architecture:** Lambda multi_prove (even for 1 AIR), Plonky3 uni-stark - -### Patches applied -1. `bench_vs_plonky3/vendor-p3-goldilocks/` — `Packing = Self` on aarch64 (disables NEON) -2. `p3-uni-stark` and `p3-dft` features `["parallel"]` enabled -3. `stark` feature `parallel` enabled by default in bench - -### Files -- `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` — Lambda AIR matching P3 shape -- `bench_vs_plonky3/src/plonky3_fibonacci.rs` — Plonky3 AIR -- `bench_vs_plonky3/src/plonky3_config.rs` — P3 config (matched FRI params) -- `bench_vs_plonky3/benches/stark_comparison.rs` — Criterion benchmark -- `bench_vs_plonky3/vendor-p3-goldilocks/` — Patched p3-goldilocks (no NEON) -- Root `Cargo.toml` — `[patch.crates-io]` for vendor p3-goldilocks - ---- - -## 2. 
Measurements - -### Config A: Both rayon, no SIMD, no SHA3 HW (M1 Max) - -Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3` - -| | Lambda | Plonky3 | Ratio | -|--|--------|---------|-------| -| **Prove** | **2.09s** [1.99, 2.20] | **0.86s** [0.84, 0.87] | **P3 2.43× faster** | -| **Verify** | **6.58ms** | **6.76ms** | **Lambda 1.03× faster** | - -### Config B: Lambda rayon ON, Plonky3 rayon OFF, NEON ON (M1 — earlier run) - -Command: `RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3` (before adding p3 parallel features) - -| | Lambda | Plonky3 | Ratio | -|--|--------|---------|-------| -| **Prove** | **3.46s** | **2.92s** | **P3 1.18× faster** | - -### Config C: Lambda rayon ON, Plonky3 rayon OFF, NEON ON, SHA3 HW ON (M1 — first run) - -Command: `cargo bench -p bench-vs-plonky3` (no RUSTFLAGS) - -| | Lambda | Plonky3 | Ratio | -|--|--------|---------|-------| -| **Prove** | **3.21s** | **1.67s** | **P3 1.92× faster** | - -### Server instruments breakdown (Lambda only, 16 cols × 2^18 pair AIR) - -Total: **1.246s** - -| Phase | Time | % | -|-------|------|---| -| R2 constraint eval | 336ms | 27% | -| R1 Main Merkle | 211ms | 17% | -| R1 reconstruct (re-LDE) | 143ms | 11% | -| R4 deep comp poly | 131ms | 11% | -| R1 Main LDE | 130ms | 10% | -| R4 FRI commit | 80ms | 6% | -| R3 OOD eval | 71ms | 6% | -| R2 comp Merkle | 54ms | 4% | -| R4 deep extend | 43ms | 3% | -| Pre-pass | 11ms | 1% | - ---- - -## 3. Root Cause Analysis - -### Why Plonky3 is ~2.4× faster (Config A) - -#### 3a. Constraint eval domain: 4× overhead (biggest factor) -- Lambda evaluates constraints on full LDE domain: `N × blowup = 2^20 points` (`evaluator.rs:274`) -- Plonky3 evaluates on quotient domain: `N = 2^18 points`, then extends via iFFT + FFT -- Lambda does 4× more constraint evaluations (each involving ext-field ops, frame fill, zerofier division) -- **Estimated contribution: 1.5-2× of the gap** - -#### 3b. 
Extension field degree 3 vs 2 -- Lambda: 9 base muls per ext-mul (`extensions_goldilocks.rs:293-309`) -- Plonky3: 4 base muls per ext-mul (`binomial_extension.rs:747-762`) -- Affects: composition poly, FRI folding, DEEP openings, OOD -- **Estimated contribution: 1.3-1.5× of the gap** - -#### 3c. Virtual dispatch vs monomorphization -- Lambda: `Vec<Box<dyn TransitionConstraint>>` → vtable call per constraint per point (`traits.rs:248-250`) -- Plonky3: `air.eval(&mut folder)` → monomorphized, all constraints inlined -- For 32 constraints × 2^20 points = 32M vtable dispatches in Lambda -- **Estimated contribution: 1.1-1.2× of the gap** - -#### 3d. Data layout: column-major vs row-major -- Lambda: column-major (cache miss per column access in constraint loop) -- Plonky3: row-major (contiguous data per row) -- **Estimated contribution: 1.05-1.1× of the gap** - -#### 3e. FRI folding sequential vs parallel -- Lambda: sequential loop in `fold_evaluations_in_place` (`fri_functions.rs:21`) -- Plonky3: `par_rows()` parallelized -- **Estimated contribution: 1.03-1.05× of the gap** - -#### Combined: 1.5 × 1.4 × 1.15 × 1.07 × 1.04 ≈ **2.7× (close to measured 2.43×)** - -### Why verify is roughly equal -- Verify doesn't do LDE, Merkle, or constraint eval -- Only ~100 point openings + FRI check -- Extension field penalty minimal at small N -- Lambda's implementation is competitive on this path - ---- - -## 4. SIMD Analysis (from profiling session) - -### NEON (aarch64/M1) -- `target_feature="neon"` and `target_feature="sha3"` are **default on aarch64-apple-darwin** -- Plonky3 uses `PackedGoldilocksNeon` (WIDTH=2) unconditionally on aarch64 via `#[cfg(target_arch = "aarch64")]` -- Plonky3 Keccak uses NEON SHA3 instructions (`veor3q_u64`, `vbcaxq_u64`, etc.) 
-- Lambda has NO SIMD in the prover -- **Goldilocks NEON base-field mul is 0.92× SLOWER** than scalar (no native 64×64→128 on NEON) -- **Fp3 NEON mul is 1.40× faster** (parallelism helps with 3 components) -- **FFT with SIMD was 0.88× (slower)** due to pack/unpack overhead - -### Disabling SIMD -- NEON packing: patched via `vendor-p3-goldilocks` (`type Packing = Self` on aarch64) -- SHA3 hardware Keccak: `-C target-feature=-sha3` (RUSTFLAGS) -- Cannot disable NEON via RUSTFLAGS alone (intrinsics used without `#[target_feature]` annotation) - -### x86_64 (server) -- Without `-C target-cpu=native`: only SSE2 (no AVX2) → Plonky3 scalar too -- With AVX2: `PackedGoldilocksAVX2` (WIDTH=4) — has native `mulq` so SIMD IS beneficial -- For fair scalar comparison on x86: `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` - ---- - -## 5. Plonky3 Parallelism - -- `p3-maybe-rayon` feature `parallel` is NOT enabled by default -- Without it, all `par_iter()` calls fall back to `core::iter` (sequential) -- `Radix2DitParallel` is "parallel" in name only without the feature -- Must explicitly enable: `p3-uni-stark = { version = "0.5.2", features = ["parallel"] }` + `p3-dft = ...` -- Verified via `cargo tree -e features | grep p3-maybe-rayon` - ---- - -## 6. 
Lambda Profiling Results (server, profile_prover, 2^20 × 16 cols) - -### Single-threaded (38.7s) -| Component | % | Category | -|-----------|---|----------| -| Constraint evaluation | 32.1% | Compute | -| Keccak hashing | 15.1% | Hashing | -| Deep composition poly | 14.0% | Compute | -| Merkle tree build | 12.0% | Hashing | -| Field multiplication | 11.1% | Compute | -| FFT | 10.5% | FFT | -| Other | 5.2% | | - -### Parallel (12 threads, 19.2s — 2.02× speedup) -| Metric | Value | -|--------|-------| -| Parallel efficiency | 16.8% of ideal 12× | -| CPU utilization | 30.6% | -| Main thread work | 13.3s | -| Worker thread work | ~5s each | -| New #1 bottleneck | Keccak (16.7%) | - -### Key profiling findings -- 100% CPU-bound (no memory/IO stalls) -- SIMD PackedGoldilocks types exist but are NOT used by prover -- Iterator overhead (Map::fold + FnMut): 7.6% -- Memory allocation overhead: 8.9% (page faults + malloc + cfree) -- Amdahl's Law: ~34% serial portion limits parallel speedup - ---- - -## 7. Optimizations Implemented (then stashed) - -### Item 2: Parallel FRI folding -- File: `crypto/stark/src/fri/fri_functions.rs` -- Change: `(0..half).into_par_iter().map().collect()` with `#[cfg(feature = "parallel")]` -- Also: `crypto/stark/src/fri/mod.rs` — added `Send + Sync` bounds -- Tests: 450/450 passed (121 stark + 326 VM + 3 bench) - -### Item 3: Quotient domain constraint evaluation -- File: `crypto/stark/src/constraints/evaluator.rs` — added `lde_stride: usize` parameter -- File: `crypto/stark/src/prover.rs` — when `number_of_parts == 1`, uses `lde_stride = blowup_factor` - then extends N evaluations to LDE via `interpolate_offset_fft + evaluate_polynomial_on_lde_domain` -- Tests: 450/450 passed -- Impact on M1: 2.09s → 2.02s (~3%, within Criterion noise) -- Impact limited because iFFT+FFT extension cost offsets constraint eval savings - -### Why stashed -User wants clean baseline first (fair comparison), then optimize. These changes are ready to re-apply. 
- ---- - -## 8. Optimization Priority (from profiling data) - -### With parallel enabled (real-world scenario) - -| # | Optimization | Impact (parallel) | Effort | Status | -|---|-------------|-------------------|--------|--------| -| 1 | PR 492 (LDE cache) | 5-8% (reduces serial) | Done (PR open) | Waiting merge | -| 2 | BLAKE3 hash | ~12% (Keccak is parallel bottleneck) | Low | Not started | -| 3 | Quotient domain eval | 3-5% (constraint eval parallelized already) | Medium | Implemented, stashed | -| 4 | Reduce allocations | 5-8% | Medium | Not started | -| 5 | Parallel FRI fold | ~3% | Low | Implemented, stashed | -| 6 | Monomorphize constraints | 3-5% | High | Not started | - -### Plonky3 degree-3 extension (Option C) -- Would eliminate the last asymmetric variable in the comparison -- Requires implementing `BinomiallyExtendable<3>` for Goldilocks in vendored crate -- Need Sage computation for: `DTH_ROOT = 2^((p-1)/3)`, `EXT_GENERATOR` -- Expected: gap drops from 2.43× to ~1.5-1.7× (confirms extension degree accounts for ~40% of gap) - ---- - -## 9. How to Run - -### M1 / aarch64 (scalar comparison) -```bash -RUSTFLAGS="-C target-feature=-sha3" cargo bench -p bench-vs-plonky3 -``` - -### x86_64 server (scalar comparison, no AVX2) -```bash -cargo bench -p bench-vs-plonky3 -# or explicitly: RUSTFLAGS="-C target-feature=-avx2,-avx512f" cargo bench ... -``` - -### With instruments (Lambda phase breakdown) -```bash -# Add "instruments" to stark features in bench_vs_plonky3/Cargo.toml first -cargo bench -p bench-vs-plonky3 --features stark/instruments -``` - -### Verify correctness -```bash -cargo test -p bench-vs-plonky3 # 3 tests -cargo test -p stark --lib # 121 tests -cargo test -p lambda-vm-prover # 326 tests -``` - ---- - -## 10. 
Key Files Reference - -| File | Purpose | -|------|---------| -| `bench_vs_plonky3/src/lambda_fibonacci_pair.rs` | Lambda AIR (32 cols, 2-row window) | -| `bench_vs_plonky3/src/plonky3_fibonacci.rs` | Plonky3 AIR (matching) | -| `bench_vs_plonky3/src/plonky3_config.rs` | P3 config (FRI params matched) | -| `bench_vs_plonky3/benches/stark_comparison.rs` | Criterion benchmark | -| `bench_vs_plonky3/vendor-p3-goldilocks/` | Patched p3-goldilocks (no NEON) | -| `crypto/stark/src/constraints/evaluator.rs` | Constraint eval loop (bottleneck) | -| `crypto/stark/src/prover.rs` | Prover pipeline (Round 1-4) | -| `crypto/stark/src/fri/fri_functions.rs` | FRI folding | -| `crypto/stark/src/domain.rs` | LDE domain definition | -| `crypto/math/src/fft/polynomial.rs` | FFT / coset_lde_full_expand | diff --git a/bench_vs_plonky3/INSTRUMENTATION.md b/bench_vs_plonky3/INSTRUMENTATION.md deleted file mode 100644 index b7b6bd4b1..000000000 --- a/bench_vs_plonky3/INSTRUMENTATION.md +++ /dev/null @@ -1,189 +0,0 @@ -# `bench_vs_plonky3` — puntos de instrumentación - -Guía de referencia para revisores / handoff. Describe **dónde está cada timer -y qué mide** en la comparación Lambda STARK vs Plonky3. No describe el AIR -en sí (eso vive en `ANALYSIS_LOG.md`). - -## Cómo correrlo - -El test que imprime el breakdown se llama `instruments_breakdown`. Hay que -compilar con la feature `instruments` y pasar `--nocapture` porque la salida -va a stdout (si no, `cargo test` se la come). 
- -**x86 (Goldilocks scalar, SSE2 Keccak residual en P3):** - -```bash -RUSTFLAGS="-C target-feature=-avx2,-avx512f" \ -cargo test -p bench-vs-plonky3 --features instruments --release -- \ - instruments_breakdown --nocapture -``` - -## Entrada principal - -- Archivo: `bench_vs_plonky3/src/lib.rs` -- Función: `instruments_breakdown` (línea 82) -- AIR Fibonacci fijo: - - `num_sequences = 16` - - `rows = 1 << 18` (2^18) - - columns = 32 (2 por secuencia) - - `blowup_factor = 2` - - `fri_number_of_queries = 219` - - `grinding_factor = 0` - -El test hace dos pasadas independientes: - -1. Corre Lambda STARK con los timers internos del crate `stark` (feature - `instruments`). -2. Corre Plonky3 con un `tracing_subscriber` custom que captura spans. - -## Feature flags - -`bench_vs_plonky3/Cargo.toml` (líneas 33-40): - -```toml -[features] -default = ["parallel"] -parallel = ["stark/parallel"] -instruments = ["stark/instruments"] -``` - -`crypto/stark/Cargo.toml` (líneas 35-41): - -```toml -[features] -instruments = [] # prints de timing en prover/verifier -parallel = ["dep:rayon", "crypto/parallel"] -``` - -`instruments` y `parallel` **coexisten** (no son excluyentes). En la práctica -los benchmarks corren siempre con ambos activos: Plonky3 usa -`Radix2DitParallel` (rayon) unconditionally, así que Lambda también tiene que -correr en paralelo para comparar apples-to-apples. - -## Lambda: estructuras de timing - -`crypto/stark/src/instruments.rs`. - -### `MultiProveTiming` (líneas 40-50) - -Recolectada dentro de `multi_prove` y consumida por el test vía -`stark::instruments::take()`. - -| Campo | Qué mide | -|---|---| -| `prepass` | Construcción de domains + `LdeTwiddles` caches. | -| `main_commits` | Round 1 Phase A: commit de todos los main traces. | -| `aux_build` | Round 1 Phase B: construcción de aux traces / LogUp. | -| `aux_commit` | Round 1 Phase B: LDE + Merkle commit de aux traces. | -| `rounds_2_4` | Tiempo total de Rounds 2-4 (todas las tablas). 
| -| `round1_sub` | Sub-op breakdown de Round 1 (`Round1SubOps`). | -| `table_timings` | Por tabla: `(name, rows, duration, TableSubOps)`. | - -### `Round1SubOps` (líneas 28-37) - -Sub-ops dentro de Round 1. Se acumulan en `AtomicU64`, así que workers rayon -las pueden incrementar en paralelo sin perder datos. - -| Campo | Qué mide | -|---|---| -| `main_lde` | Main trace: `expand_columns_to_lde` (LDE/FFT). | -| `main_merkle` | Main trace: `commit_columns_bit_reversed` (Merkle). | -| `aux_lde` | Aux trace: `expand_columns_to_lde`. | -| `aux_merkle` | Aux trace: `commit_columns_bit_reversed`. | - -### `TableSubOps` (líneas 7-24) - -Por tabla, dentro de Rounds 2-4. Las partes de R2/R4 se pasan por -thread-locals (`R2_SUB`, `R4_SUB`) y después se ensamblan en -`prove_rounds_2_to_4` (ver más abajo). - -| Campo | Round | Qué mide | -|---|---|---| -| `constraints` | R2 | `evaluator.evaluate()` — constraints sobre dominio LDE. | -| `comp_decompose` | R2 | `decompose_and_extend_d2` — iFFT + extensión del composition poly. | -| `comp_commit` | R2 | Merkle commit del composition poly. | -| `ood` | R3 | Barycentric OOD eval (ver nota sobre dónde se captura). | -| `deep_comp` | R4 | `compute_deep_composition_poly_evaluations`. | -| `deep_extend` | R4 | `interpolate_fft` + `evaluate_fft` para extender el deep comp poly. | -| `fri_commit` | R4 | `fri::commit_phase_from_evaluations` (folds + Merkle layers). | -| `queries` | R4 | Grinding (si hay) + sampling + FRI query phase + Merkle openings. | - -### Dónde se capturan (en `crypto/stark/src/prover.rs`) - -- `multi_prove` (línea 1490): - - `reset_all()` (1502). - - `prepass` timer (1515-1533). - - `main_commits` timer (1541-…). - - `aux_build`, `aux_commit` timers (durante Round 1 Phase B). - - `rounds_2_4` timer; al final: `store(MultiProveTiming)`. -- `round_2_compute_composition_polynomial` — `constraints` / `comp_decompose` / - `comp_commit` (vía `store_r2_sub`). 
-- `prove_rounds_2_to_4` — **acá** se captura el OOD: - `round_3_dur = t_r3.elapsed()` en líneas 1957-1967, y se guarda en - `TableSubOps.ood` (línea 2010). `round_3_evaluate_polynomials_in_out_of_domain_element` - **no** tiene instrumentación propia. -- `round_4_compute_and_run_fri_on_the_deep_composition_polynomial` — - `deep_comp` / `deep_extend` / `fri_commit` / `queries` - (vía `store_r4_sub`). - -## Plonky3: breakdown por spans - -Todo vive dentro de `instruments_breakdown` en `bench_vs_plonky3/src/lib.rs`, -después del bloque de Lambda. - -- Se define una `P3TimingLayer` custom (líneas 216-259) que implementa - `tracing_subscriber::Layer`: - - `on_new_span` guarda el nombre del span. - - `on_enter` guarda `Instant::now()`. - - `on_close` calcula `start.elapsed()` y lo empuja a un `Vec<(name, ms)>`. -- Se monta un subscriber con `LevelFilter::DEBUG` (línea 266) y se instala - como default **sólo durante el `p3_uni_stark::prove`** (líneas 275-280, - scope con `_guard`). -- Post-prove: orden descendente por duración (287), filtra spans con - `ms >= 0.1` (289), y calcula `(unaccounted) = total − Σspans` (293-301). - -### Qué implica el diseño - -- **La capa no filtra por crate**: captura *cualquier* span DEBUG emitido - mientras el subscriber está vivo. En la práctica sólo corre - `p3_uni_stark::prove` dentro de ese bloque, así que todos los spans que - salen son de Plonky3 — pero si alguien agrega un `#[instrument]` propio - dentro del scope del guard, también se va a contar. -- **No hay instrumentación manual de funciones de Plonky3.** La granularidad - del breakdown = spans que Plonky3 ya emite internamente. -- **Nesting / doble-conteo:** P3 tiene spans anidados (p.ej. - `prove ⊃ compute_quotient_values ⊃ evaluate_constraints`). Cada span se - cuenta una vez con su wall-clock entre `on_enter` y `on_close`, así que - **`Σspans > wall-clock` es esperable, no es un bug**. 
Consecuencia: - `(unaccounted) = total − Σspans` **puede quedar negativo** en presencia de - nesting — no significa que falte tiempo, significa que los spans padre se - solapan con sus hijos. El código sólo imprime `(unaccounted)` si - `> 1.0ms`, así que casos negativos se silencian. - -## Segunda capa de instrumentación (no la usa `bench_vs_plonky3`) - -Existe una capa adicional en `prover/src/instruments.rs` (líneas 54-211, -`print_report`) — orientada al ejecutor del VM (execute + trace build + AIR -construction) que además re-imprime el `MultiProveTiming` del STARK con -otro formato. `bench_vs_plonky3` **no** la invoca; sólo consume -`stark::instruments::take()` directamente. Vale la pena saberlo si buscás -timings y aparecen en logs distintos. - -## Advertencias para el revisor - -1. Lambda: timing manual, específico del pipeline `multi_prove`. Granularidad - fina pero acoplada al código — moverlo rompe los breakpoints. -2. Plonky3: span-based. Granularidad = la que P3 decida exponer. Si P3 deja - de emitir un span en una versión futura, la línea desaparece del reporte - sin previo aviso. -3. Los porcentajes de Lambda se calculan contra el **total wall-clock del - test** (no contra `rounds_2_4`), así que la suma no cierra al 100% — hay - tiempo fuera de `multi_prove` (construcción de AIR, setup). -4. Los porcentajes de Plonky3 se calculan contra **`p3_prove_dur`** (solo el - `prove`, sin setup). -5. El benchmark usa **degree 3** para la extensión de Plonky3 vía git deps a - la rama `feat/goldilocks_deg3` del fork `yetanotherco/Plonky3` (ver - `bench_vs_plonky3/Cargo.toml`), que provee `BinomiallyExtendable<3>` - para Goldilocks con el mismo irreducible `x^3 - 2` que Lambda. -6. Plataforma: x86 con `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` → - Goldilocks scalar, residual SSE2 en Keccak de P3 (~7%). 
diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md index fea3c8d7e..066582280 100644 --- a/bench_vs_plonky3/README.md +++ b/bench_vs_plonky3/README.md @@ -32,16 +32,6 @@ with `plonky3_config::matched_params_config`. Both AIRs are **cell-by-cell equivalent** — this is asserted by the `lambda_pair_trace_matches_plonky3_trace` test. -## Prerequisites - -- Rust stable (the crate builds with `cargo build --release`). -- No SP1 toolchain needed — there's no VM guest compilation. -- Read access to `https://github.com/yetanotherco/Plonky3.git` (branch - `feat/goldilocks_deg3`). Cargo clones it into `~/.cargo/git/db` on the - first build and `Cargo.lock` pins the SHA. The branch provides - `BinomiallyExtendable<3>` for Goldilocks (`x^3 - 2`, matching Lambda's - `Degree3GoldilocksExtensionField`). - ## Usage ```bash @@ -163,9 +153,6 @@ cargo test -p bench-vs-plonky3 --features instruments --release -- \ Spans nest (e.g. `prove ⊃ compute_quotient_values`), so Σspans > total is expected and not a bug. `(unaccounted)` can be negative from nesting. -Details of every timer (which method it wraps, where it lives) are in -[`INSTRUMENTATION.md`](INSTRUMENTATION.md). - The nightly does **not** activate this path — it would add ~1 % overhead and pollute the historical wall-clock numbers. @@ -182,6 +169,7 @@ pollute the historical wall-clock numbers. and AVX-512 so Goldilocks arithmetic is scalar on both sides. `p3-keccak`'s SSE2 path on x86 is not disabled. - **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both - sides. Security models differ (Lambda: Johnson-bound, ~108 bits; P3: - conjectured, ~192 bits) — the compute work is equivalent, the claimed - soundness is not. See `ANALYSIS_LOG.md` for the full fairness audit. + sides. 
Security models differ (Lambda: Johnson-bound, ~108 bits proven; + P3: conjectured, 219 queries × 1 bit = 219 bits, capped at 192 by the + cubic extension field) — the compute work is equivalent, the claimed + soundness is not. From 3010b38c353a8d2036e6c6ea27e3c4f2afb7fd78 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 20 Apr 2026 17:11:48 -0300 Subject: [PATCH 25/34] address comments --- bench_vs_plonky3/Cargo.toml | 24 +++++------ bench_vs_plonky3/run.sh | 6 +-- bench_vs_plonky3/src/lib.rs | 56 ++++++++++++++++++++------ bench_vs_plonky3/src/plonky3_config.rs | 13 ------ 4 files changed, 58 insertions(+), 41 deletions(-) diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index 239abf316..39f8c7330 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -18,20 +18,20 @@ math = { path = "../crypto/math", features = [ # the same git source + ref; declaring any of them as a crates.io dep would # pull in a second incompatible p3-field. cargo clones the fork once into # ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time. 
-p3-air = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-field = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-goldilocks = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-matrix = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-commit = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-challenger = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-symmetric = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-merkle-tree = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-keccak = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-fri = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-uni-stark = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ +p3-air = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-field = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-goldilocks = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-matrix = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-commit = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-challenger = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-symmetric = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } 
+p3-keccak = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-fri = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } +p3-uni-stark = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ "parallel", ] } -p3-dft = { git = "ssh://git@github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ +p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ "parallel", ] } diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index a23f0144d..1bdedbb5c 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -18,7 +18,8 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" -TMP_DIR="/tmp/bench_p3" +TMP_DIR="$(mktemp -d -t bench_p3.XXXXXX)" +trap 'rm -rf "$TMP_DIR"' EXIT REPORT_DIR="" NO_COLOR=false SCALAR=false @@ -109,9 +110,6 @@ if $NO_COLOR; then NC='' fi -mkdir -p "$TMP_DIR" -rm -rf "$TMP_DIR"/* - if [ -n "$REPORT_DIR" ]; then mkdir -p "$REPORT_DIR/raw" fi diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs index d61c6ea9e..7c722153e 100644 --- a/bench_vs_plonky3/src/lib.rs +++ b/bench_vs_plonky3/src/lib.rs @@ -76,8 +76,9 @@ mod tests { } /// Lambda prove with instruments breakdown + P3 span-based breakdown. 
- /// Run: cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --nocapture + /// Run: cargo test -p bench-vs-plonky3 --features instruments --release -- instruments_breakdown --ignored --nocapture #[test] + #[ignore = "heavy: run with --release -- instruments_breakdown --ignored --nocapture"] fn instruments_breakdown() { let num_sequences = 16; let rows = 1 << 19; @@ -211,8 +212,14 @@ mod tests { type SpanResults = Arc>>; + struct SpanState { + name: String, + active_since: Option, + accumulated: std::time::Duration, + } + struct P3TimingLayer { - spans: Mutex)>>, + spans: Mutex>, results: SpanResults, } @@ -227,19 +234,39 @@ mod tests { _ctx: tracing_subscriber::layer::Context<'_, S>, ) { let name = attrs.metadata().name().to_string(); - self.spans - .lock() - .unwrap() - .insert(id.into_u64(), (name, None)); + self.spans.lock().unwrap().insert( + id.into_u64(), + SpanState { + name, + active_since: None, + accumulated: std::time::Duration::ZERO, + }, + ); } + // Rayon can re-enter a span across threads, so only start timing on + // the first enter after each exit; accumulate every interval. 
fn on_enter( &self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>, ) { - if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) { - entry.1 = Some(std::time::Instant::now()); + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) + && entry.active_since.is_none() + { + entry.active_since = Some(std::time::Instant::now()); + } + } + + fn on_exit( + &self, + id: &tracing::span::Id, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) + && let Some(start) = entry.active_since.take() + { + entry.accumulated += start.elapsed(); } } @@ -248,10 +275,15 @@ mod tests { id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>, ) { - if let Some((name, Some(start))) = self.spans.lock().unwrap().remove(&id.into_u64()) - { - let ms = start.elapsed().as_secs_f64() * 1000.0; - self.results.lock().unwrap().push((name, ms)); + if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) { + // If we never saw on_exit (span closed while active), include + // the dangling interval. + let mut total = entry.accumulated; + if let Some(start) = entry.active_since { + total += start.elapsed(); + } + let ms = total.as_secs_f64() * 1000.0; + self.results.lock().unwrap().push((entry.name, ms)); } } } diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs index cc57a3e5d..971660f37 100644 --- a/bench_vs_plonky3/src/plonky3_config.rs +++ b/bench_vs_plonky3/src/plonky3_config.rs @@ -68,16 +68,3 @@ pub fn matched_params_config() -> P3Config { let pcs = Pcs::new(dft, val_mmcs, fri_params); P3Config::new(pcs, challenger) } - -/// Creates a Plonky3 STARK config with Plonky3's standard benchmark parameters: -/// blowup=2, 100 FRI queries, 16-bit query PoW. 
-pub fn plonky3_benchmark_config() -> P3Config { - let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs(); - let dft = Dft::default(); - let challenger = Challenger::from_hasher(vec![], byte_hash); - - let fri_params = FriParameters::new_benchmark(challenge_mmcs); - - let pcs = Pcs::new(dft, val_mmcs, fri_params); - P3Config::new(pcs, challenger) -} From 4b19d250f80221a0e7b8b0b70bae3285897c8572 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 4 May 2026 12:09:46 -0300 Subject: [PATCH 26/34] Migrate FibonacciPair AIR constraints --- bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs index 54c704976..751e86855 100644 --- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs +++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs @@ -25,7 +25,7 @@ use math::field::{ use stark::{ constraints::{ boundary::{BoundaryConstraint, BoundaryConstraints}, - transition::TransitionConstraint, + transition::TransitionConstraintEvaluator, }, context::AirContext, proof::options::ProofOptions, @@ -61,7 +61,7 @@ where } } -impl TransitionConstraint for FibPairShiftConstraint +impl TransitionConstraintEvaluator for FibPairShiftConstraint where F: IsSubFieldOf + IsFFTField + Send + Sync, E: IsField + Send + Sync, @@ -78,7 +78,11 @@ where 1 } - fn evaluate(&self, eval_ctx: &TransitionEvaluationContext, out: &mut [FieldElement]) { + fn evaluate_verifier( + &self, + eval_ctx: &TransitionEvaluationContext, + out: &mut [FieldElement], + ) { match eval_ctx { TransitionEvaluationContext::Prover { frame, .. 
} => { let s0 = frame.get_evaluation_step(0); @@ -130,7 +134,7 @@ where } } -impl TransitionConstraint for FibPairSumConstraint +impl TransitionConstraintEvaluator for FibPairSumConstraint where F: IsSubFieldOf + IsFFTField + Send + Sync, E: IsField + Send + Sync, @@ -147,7 +151,11 @@ where 1 } - fn evaluate(&self, eval_ctx: &TransitionEvaluationContext, out: &mut [FieldElement]) { + fn evaluate_verifier( + &self, + eval_ctx: &TransitionEvaluationContext, + out: &mut [FieldElement], + ) { match eval_ctx { TransitionEvaluationContext::Prover { frame, .. } => { let s0 = frame.get_evaluation_step(0); @@ -184,7 +192,7 @@ where E: IsField + Send + Sync, { context: AirContext, - constraints: Vec>>, + constraints: Vec>>, num_sequences: usize, } @@ -209,7 +217,7 @@ where trace_length } - fn transition_constraints(&self) -> &Vec>> { + fn transition_constraints(&self) -> &Vec>> { &self.constraints } @@ -251,7 +259,7 @@ where E: IsField + Send + Sync + 'static, { pub fn with_num_sequences(proof_options: &ProofOptions, num_sequences: usize) -> Self { - let mut constraints: Vec>> = + let mut constraints: Vec>> = Vec::with_capacity(2 * num_sequences); for seq in 0..num_sequences { constraints.push(Box::new(FibPairShiftConstraint::new(seq, 2 * seq))); From 7ee4cbf01dea951ddd76f85decd02a5c25443185 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 4 May 2026 15:13:10 -0300 Subject: [PATCH 27/34] Add full metrics and phase breakdown to bench_vs_plonky3 --- .github/workflows/bench-vs-p3-nightly.yml | 2 +- bench_vs_plonky3/Cargo.toml | 3 + bench_vs_plonky3/run.sh | 312 +++++++++++-- bench_vs_plonky3/src/bin/prove_bench.rs | 414 +++++++++++++++++- bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 7 +- bench_vs_plonky3/src/plonky3_config.rs | 28 +- 6 files changed, 706 insertions(+), 60 deletions(-) diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml index d27bd9010..03fedad2b 100644 --- a/.github/workflows/bench-vs-p3-nightly.yml +++ 
b/.github/workflows/bench-vs-p3-nightly.yml @@ -34,7 +34,7 @@ jobs: bash ./bench_vs_plonky3/run.sh \ --log-rows 19 \ --num-sequences 16 \ - --runs 3 \ + --runs 10 \ --scalar \ --report-dir bench_vs_p3_artifacts \ --no-color diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index 39f8c7330..92deaa31c 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -38,6 +38,9 @@ p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/g # Tracing for P3 span-based profiling tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] } +libc = "0.2" +serde = { version = "1.0", features = ["derive"] } +serde_cbor = "0.11" [dev-dependencies] criterion = { version = "0.4", default-features = false } diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index 1bdedbb5c..0098fed33 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -5,10 +5,10 @@ # Usage: # ./bench_vs_plonky3/run.sh [--log-rows K ...] [--num-sequences N] [--runs N] # [--lambda-only | --p3-only] [--report-dir DIR] -# [--scalar] [--no-color] +# [--scalar] [--breakdown] [--no-color] # -# Defaults: --log-rows 19, --num-sequences 16, --runs 3. -# With multiple --log-rows values, prints one median row per size. +# Defaults: --log-rows 19, --num-sequences 16, --runs 10. +# With multiple --log-rows values, prints one stats row per size. # # --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks (and most of Keccak) # run scalar; residual SSE2 in p3-keccak remains. 
Triggers a rebuild when @@ -23,6 +23,7 @@ trap 'rm -rf "$TMP_DIR"' EXIT REPORT_DIR="" NO_COLOR=false SCALAR=false +BREAKDOWN=false RED='\033[0;31m' GREEN='\033[0;32m' @@ -32,7 +33,10 @@ NC='\033[0m' LOG_ROWS=() NUM_SEQUENCES=16 -RUNS=3 +RUNS=10 +BLOWUP=2 +FRI_QUERIES=219 +GRINDING=0 RUN_LAMBDA=true RUN_P3=true @@ -73,6 +77,10 @@ while [[ $# -gt 0 ]]; do SCALAR=true shift ;; + --breakdown) + BREAKDOWN=true + shift + ;; --no-color) NO_COLOR=true shift @@ -146,8 +154,14 @@ fi echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}" echo -e " log-rows: ${YELLOW}${LOG_ROWS[*]}${NC}" echo -e " num-sequences: ${YELLOW}${NUM_SEQUENCES}${NC} (columns = $((2 * NUM_SEQUENCES)))" -echo -e " runs/size: ${YELLOW}${RUNS}${NC} (median reported)" +echo -e " runs/size: ${YELLOW}${RUNS}${NC} (median + CV reported)" echo -e " p3 extension: ${YELLOW}degree 3 (forked p3-goldilocks, matches Lambda)${NC}" +echo -e " proof params: ${YELLOW}blowup=${BLOWUP}, queries=${FRI_QUERIES}, grinding=${GRINDING}${NC}" +if $BREAKDOWN; then + echo -e " breakdown: ${YELLOW}on${NC} (Lambda instruments + P3 tracing spans)" +else + echo -e " breakdown: ${YELLOW}off${NC}" +fi if $SCALAR_ACTIVE; then echo -e " scalar mode: ${YELLOW}on${NC} (arch=$(uname -m), RUSTFLAGS=\"${RUSTFLAGS:-}\")" elif $SCALAR; then @@ -158,8 +172,11 @@ fi echo "" echo -e "${GREEN}[build]${NC} prove_bench" -cargo build --release -p bench-vs-plonky3 --bin prove_bench \ - --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -5 +BUILD_ARGS=(build --release -p bench-vs-plonky3 --bin prove_bench --manifest-path "$ROOT_DIR/Cargo.toml") +if $BREAKDOWN; then + BUILD_ARGS+=(--features instruments) +fi +cargo "${BUILD_ARGS[@]}" 2>&1 | tail -5 # Resolve the actual target directory via cargo metadata so we find the binary # whether cargo used ./target/ (default) or a custom CARGO_TARGET_DIR. 
@@ -182,17 +199,30 @@ extract_proving_time() { }' } +extract_metrics_line() { + sed -n '/^METRICS / { + p + q + }' +} + +metric_value() { + local line=$1 + local key=$2 + printf '%s\n' "$line" | tr '\t' '\n' | LC_ALL=C awk -F= -v key="$key" '$1 == key { print $2; exit }' +} + median_of() { - # prints median of the given numeric arguments (rounded to 3 decimals). + # prints median of the given numeric arguments. # Uses shell `sort -g` for portability (macOS awk lacks gawk's asort). printf '%s\n' "$@" | LC_ALL=C sort -g | LC_NUMERIC=C awk ' { a[NR] = $0 + 0 } END { if (NR == 0) { print "n/a"; exit } if (NR % 2 == 1) { - printf "%.3f\n", a[(NR + 1) / 2] + printf "%.6f\n", a[(NR + 1) / 2] } else { - printf "%.3f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2 + printf "%.6f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2 } }' } @@ -204,6 +234,85 @@ ratio_fmt() { }' } +mean_file() { + LC_NUMERIC=C awk '{ s += $1; n++ } END { if (n == 0) print "n/a"; else printf "%.6f\n", s / n }' "$1" +} + +median_file() { + LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk ' + { a[NR] = $0 + 0 } + END { + if (NR == 0) { print "n/a"; exit } + if (NR % 2 == 1) printf "%.6f\n", a[(NR + 1) / 2] + else printf "%.6f\n", (a[NR / 2] + a[NR / 2 + 1]) / 2 + }' +} + +stddev_file() { + LC_NUMERIC=C awk ' + { s += $1; ss += $1 * $1; n++ } + END { + if (n == 0) { print "n/a"; exit } + m = s / n + v = (ss / n) - (m * m) + if (v < 0) v = 0 + printf "%.6f\n", sqrt(v) + }' "$1" +} + +cv_pct_file() { + LC_NUMERIC=C awk ' + { s += $1; ss += $1 * $1; n++ } + END { + if (n == 0) { print "n/a"; exit } + m = s / n + v = (ss / n) - (m * m) + if (v < 0) v = 0 + sd = sqrt(v) + if (m == 0) print "n/a" + else printf "%.2f\n", sd * 100 / m + }' "$1" +} + +min_file() { + LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk 'NR == 1 { printf "%.6f\n", $1; exit }' +} + +max_file() { + LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk '{ x = $1 } END { if (NR == 0) print "n/a"; else printf "%.6f\n", x }' +} + +fmt0() { + LC_NUMERIC=C awk -v v="$1" 'BEGIN { if (v == "n/a") 
print v; else printf "%.0f\n", v }' +} + +metric_file_for() { + local metrics_file=$1 + local key=$2 + local out_file=$3 + : > "$out_file" + while IFS= read -r line; do + local value + value=$(metric_value "$line" "$key") + if [ -n "$value" ] && [ "$value" != "n/a" ]; then + printf '%s\n' "$value" >> "$out_file" + fi + done < "$metrics_file" +} + +median_metric() { + local prover=$1 + local log_rows=$2 + local key=$3 + local file="$TMP_DIR/${prover}_${log_rows}_${key}.values" + metric_file_for "$TMP_DIR/${prover}_${log_rows}.metrics" "$key" "$file" + if [ ! -s "$file" ]; then + printf "n/a\n" + else + median_file "$file" + fi +} + # --- Run benchmark ---------------------------------------------------------- RESULT_LOG_ROWS=() @@ -211,34 +320,58 @@ RESULT_ROWS=() RESULT_LAMBDA=() RESULT_P3=() RESULT_RATIO=() +RESULT_LAMBDA_CV=() +RESULT_P3_CV=() +RESULT_LAMBDA_VERIFY=() +RESULT_P3_VERIFY=() +RESULT_LAMBDA_PROOF_SIZE=() +RESULT_P3_PROOF_SIZE=() +RESULT_LAMBDA_RSS=() +RESULT_P3_RSS=() run_prover() { local prover=$1 # lambda | p3 local log_rows=$2 local times=() + local metrics_file="$TMP_DIR/${prover}_${log_rows}.metrics" + local breakdown_file="$TMP_DIR/${prover}_${log_rows}.breakdown" + : > "$metrics_file" + : > "$breakdown_file" for run_i in $(seq 1 "$RUNS"); do local out_file="$TMP_DIR/${prover}_${log_rows}_${run_i}.stdout" - if ! "$BIN" --prover "$prover" \ - --log-rows "$log_rows" \ - --num-sequences "$NUM_SEQUENCES" > "$out_file" 2>&1; then + local run_args=(--prover "$prover" --log-rows "$log_rows" --num-sequences "$NUM_SEQUENCES" --blowup "$BLOWUP" --queries "$FRI_QUERIES" --grinding "$GRINDING") + if $BREAKDOWN; then + run_args+=(--breakdown) + fi + if ! 
"$BIN" "${run_args[@]}" > "$out_file" 2>&1; then echo -e " ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}" cat "$out_file" exit 1 fi - local t - t=$(extract_proving_time < "$out_file") - if [ -z "$t" ]; then - echo -e " ${RED}[${prover}] could not parse proving time (log-rows=${log_rows}, run ${run_i})${NC}" + local metrics_line + metrics_line=$(extract_metrics_line < "$out_file") + if [ -z "$metrics_line" ]; then + echo -e " ${RED}[${prover}] could not parse metrics (log-rows=${log_rows}, run ${run_i})${NC}" cat "$out_file" exit 1 fi + printf '%s\n' "$metrics_line" >> "$metrics_file" + if $BREAKDOWN; then + sed -n "s/^BREAKDOWN /BREAKDOWN run=${run_i} /p" "$out_file" >> "$breakdown_file" + fi + + local t + t=$(metric_value "$metrics_line" prove_s) + if [ -z "$t" ]; then + t=$(extract_proving_time < "$out_file") + fi times+=("$t") if [ -n "$REPORT_DIR" ]; then cp "$out_file" "$REPORT_DIR/raw/${prover}_log${log_rows}_run${run_i}.stdout" fi done - median_of "${times[@]}" printf '%s\n' "${times[@]}" > "$TMP_DIR/${prover}_${log_rows}.times" + median_of "${times[@]}" } for lr in "${LOG_ROWS[@]}"; do @@ -247,17 +380,33 @@ for lr in "${LOG_ROWS[@]}"; do lambda_median="n/a" p3_median="n/a" + lambda_cv="n/a" + p3_cv="n/a" + lambda_verify="n/a" + p3_verify="n/a" + lambda_proof_size="n/a" + p3_proof_size="n/a" + lambda_rss="n/a" + p3_rss="n/a" if $RUN_LAMBDA; then echo -ne " ${GREEN}[lambda]${NC} " lambda_median=$(run_prover lambda "$lr") - echo -e "median ${BOLD}${lambda_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/lambda_${lr}.times")" + lambda_cv=$(cv_pct_file "$TMP_DIR/lambda_${lr}.times") + lambda_verify=$(median_metric lambda "$lr" verify_s) + lambda_proof_size=$(median_metric lambda "$lr" proof_size_bytes) + lambda_rss=$(median_metric lambda "$lr" peak_rss_kb) + echo -e "prove median ${BOLD}${lambda_median}s${NC} (CV ${lambda_cv}%), verify ${lambda_verify}s, proof $(fmt0 "$lambda_proof_size") B, rss $(fmt0 "$lambda_rss") KB" fi if $RUN_P3; 
then echo -ne " ${GREEN}[p3]${NC} " p3_median=$(run_prover p3 "$lr") - echo -e "median ${BOLD}${p3_median}s${NC} from $RUNS runs: $(paste -sd, "$TMP_DIR/p3_${lr}.times")" + p3_cv=$(cv_pct_file "$TMP_DIR/p3_${lr}.times") + p3_verify=$(median_metric p3 "$lr" verify_s) + p3_proof_size=$(median_metric p3 "$lr" proof_size_bytes) + p3_rss=$(median_metric p3 "$lr" peak_rss_kb) + echo -e "prove median ${BOLD}${p3_median}s${NC} (CV ${p3_cv}%), verify ${p3_verify}s, proof $(fmt0 "$p3_proof_size") B, rss $(fmt0 "$p3_rss") KB" fi local_ratio="n/a" @@ -270,6 +419,14 @@ for lr in "${LOG_ROWS[@]}"; do RESULT_LAMBDA+=("$lambda_median") RESULT_P3+=("$p3_median") RESULT_RATIO+=("$local_ratio") + RESULT_LAMBDA_CV+=("$lambda_cv") + RESULT_P3_CV+=("$p3_cv") + RESULT_LAMBDA_VERIFY+=("$lambda_verify") + RESULT_P3_VERIFY+=("$p3_verify") + RESULT_LAMBDA_PROOF_SIZE+=("$lambda_proof_size") + RESULT_P3_PROOF_SIZE+=("$p3_proof_size") + RESULT_LAMBDA_RSS+=("$lambda_rss") + RESULT_P3_RSS+=("$p3_rss") done # --- Summary table ---------------------------------------------------------- @@ -277,11 +434,11 @@ done echo "" echo -e "${BOLD}=== Summary ===${NC}" if $RUN_LAMBDA && $RUN_P3; then - printf " %-9s %-12s %14s %14s %10s\n" "log-rows" "rows" "Lambda (s)" "P3 (s)" "L/P3" - printf " %-9s %-12s %14s %14s %10s\n" "--------" "----" "----------" "------" "----" + printf " %-9s %-12s %14s %9s %14s %9s %10s\n" "log-rows" "rows" "Lambda (s)" "L CV%" "P3 (s)" "P3 CV%" "L/P3" + printf " %-9s %-12s %14s %9s %14s %9s %10s\n" "--------" "----" "----------" "-----" "------" "------" "----" else - printf " %-9s %-12s %14s\n" "log-rows" "rows" "Time (s)" - printf " %-9s %-12s %14s\n" "--------" "----" "--------" + printf " %-9s %-12s %14s %9s\n" "log-rows" "rows" "Time (s)" "CV%" + printf " %-9s %-12s %14s %9s\n" "--------" "----" "--------" "---" fi for i in "${!RESULT_LOG_ROWS[@]}"; do @@ -290,6 +447,8 @@ for i in "${!RESULT_LOG_ROWS[@]}"; do lt="${RESULT_LAMBDA[$i]}" pt="${RESULT_P3[$i]}" 
rt="${RESULT_RATIO[$i]}" + lcv="${RESULT_LAMBDA_CV[$i]}" + pcv="${RESULT_P3_CV[$i]}" if $RUN_LAMBDA && $RUN_P3; then color=$GREEN verdict="Lambda faster" @@ -297,18 +456,18 @@ for i in "${!RESULT_LOG_ROWS[@]}"; do color=$RED verdict="P3 faster" fi - printf " %-9s %-12s %13ss %13ss ${color}%9sx${NC} (${color}%s${NC})\n" \ - "$lr" "$rows" "$lt" "$pt" "$rt" "$verdict" + printf " %-9s %-12s %13ss %8s%% %13ss %8s%% ${color}%9sx${NC} (${color}%s${NC})\n" \ + "$lr" "$rows" "$lt" "$lcv" "$pt" "$pcv" "$rt" "$verdict" elif $RUN_LAMBDA; then - printf " %-9s %-12s %13ss\n" "$lr" "$rows" "$lt" + printf " %-9s %-12s %13ss %8s%%\n" "$lr" "$rows" "$lt" "$lcv" else - printf " %-9s %-12s %13ss\n" "$lr" "$rows" "$pt" + printf " %-9s %-12s %13ss %8s%%\n" "$lr" "$rows" "$pt" "$pcv" fi done echo "" if $RUN_LAMBDA && $RUN_P3; then - echo -e "Timing window: single-shot end-to-end prove." + echo -e "Timing window: prove only for the ratio. Verify, proof size, RSS and throughput are reported separately." fi # --- Machine-readable report ------------------------------------------------ @@ -325,18 +484,86 @@ if [ -n "$REPORT_DIR" ]; then } { - printf "log_rows\trows\tlambda_median_s\tp3_median_s\tratio_lambda_over_p3\truns\n" + printf "log_rows\trows\tlambda_prove_median_s\tlambda_prove_cv_pct\tlambda_verify_median_s\tlambda_proof_size_bytes_median\tlambda_peak_rss_kb_median\tp3_prove_median_s\tp3_prove_cv_pct\tp3_verify_median_s\tp3_proof_size_bytes_median\tp3_peak_rss_kb_median\tratio_lambda_over_p3\truns\n" for i in "${!RESULT_LOG_ROWS[@]}"; do - printf "%s\t%s\t%s\t%s\t%s\t%s\n" \ + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ "${RESULT_LOG_ROWS[$i]}" \ "${RESULT_ROWS[$i]}" \ "${RESULT_LAMBDA[$i]}" \ + "${RESULT_LAMBDA_CV[$i]}" \ + "${RESULT_LAMBDA_VERIFY[$i]}" \ + "${RESULT_LAMBDA_PROOF_SIZE[$i]}" \ + "${RESULT_LAMBDA_RSS[$i]}" \ "${RESULT_P3[$i]}" \ + "${RESULT_P3_CV[$i]}" \ + "${RESULT_P3_VERIFY[$i]}" \ + "${RESULT_P3_PROOF_SIZE[$i]}" \ + "${RESULT_P3_RSS[$i]}" \ 
"${RESULT_RATIO[$i]}" \ "$RUNS" done } > "$REPORT_DIR/results.tsv" + { + printf "workload\tprover\tlog_rows\trows\tnum_sequences\tmain_cols\taux_cols\ttables\tlogup\tblowup\tfri_queries\tgrinding\tprove_s\tverify_s\tproof_size_bytes\tpeak_rss_kb\trows_per_sec\tcells_per_sec\n" + for lr in "${RESULT_LOG_ROWS[@]}"; do + for prover in lambda p3; do + metrics_file="$TMP_DIR/${prover}_${lr}.metrics" + if [ ! -f "$metrics_file" ]; then + continue + fi + while IFS= read -r line; do + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ + "$(metric_value "$line" workload)" \ + "$(metric_value "$line" prover)" \ + "$(metric_value "$line" log_rows)" \ + "$(metric_value "$line" rows)" \ + "$(metric_value "$line" num_sequences)" \ + "$(metric_value "$line" main_cols)" \ + "$(metric_value "$line" aux_cols)" \ + "$(metric_value "$line" tables)" \ + "$(metric_value "$line" logup)" \ + "$(metric_value "$line" blowup)" \ + "$(metric_value "$line" fri_queries)" \ + "$(metric_value "$line" grinding)" \ + "$(metric_value "$line" prove_s)" \ + "$(metric_value "$line" verify_s)" \ + "$(metric_value "$line" proof_size_bytes)" \ + "$(metric_value "$line" peak_rss_kb)" \ + "$(metric_value "$line" rows_per_sec)" \ + "$(metric_value "$line" cells_per_sec)" + done < "$metrics_file" + done + done + } > "$REPORT_DIR/raw_metrics.tsv" + + if $BREAKDOWN; then + { + printf "run\tworkload\tprover\tlog_rows\trows\tphase\tms\ttable\ttable_rows\tspan\n" + for lr in "${RESULT_LOG_ROWS[@]}"; do + for prover in lambda p3; do + breakdown_file="$TMP_DIR/${prover}_${lr}.breakdown" + if [ ! 
-f "$breakdown_file" ]; then + continue + fi + while IFS= read -r line; do + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ + "$(metric_value "$line" run)" \ + "$(metric_value "$line" workload)" \ + "$(metric_value "$line" prover)" \ + "$(metric_value "$line" log_rows)" \ + "$(metric_value "$line" rows)" \ + "$(metric_value "$line" phase)" \ + "$(metric_value "$line" ms)" \ + "$(metric_value "$line" table)" \ + "$(metric_value "$line" table_rows)" \ + "$(metric_value "$line" span)" + done < "$breakdown_file" + done + done + } > "$REPORT_DIR/breakdown.tsv" + fi + # Capture commit + timestamp so the artifact is self-describing. git_sha="$(git -C "$ROOT_DIR" rev-parse HEAD 2>/dev/null || echo unknown)" git_dirty="clean" @@ -352,10 +579,15 @@ if [ -n "$REPORT_DIR" ]; then echo "arch=$(uname -m)" echo "num_sequences=$NUM_SEQUENCES" echo "columns=$((2 * NUM_SEQUENCES))" - echo "blowup=2" - echo "fri_queries=219" - echo "grinding=0" + echo "blowup=$BLOWUP" + echo "fri_queries=$FRI_QUERIES" + echo "grinding=$GRINDING" echo "runs_per_size=$RUNS" + if $BREAKDOWN; then + echo "breakdown=on" + else + echo "breakdown=off" + fi echo "p3_extension=degree3_fork" if $SCALAR_ACTIVE; then echo "scalar=on" @@ -365,11 +597,17 @@ if [ -n "$REPORT_DIR" ]; then else echo "scalar=off" fi - echo "timing_window=single_shot_end_to_end_prove_no_verify" + echo "timing_window=prove_only_ratio_verify_size_rss_reported_separately" echo "log_rows_series=$(join_slash "${RESULT_LOG_ROWS[@]}")" echo "rows_series=$(join_slash "${RESULT_ROWS[@]}")" - echo "lambda_medians=$(join_slash "${RESULT_LAMBDA[@]}")" - echo "p3_medians=$(join_slash "${RESULT_P3[@]}")" + echo "lambda_prove_medians=$(join_slash "${RESULT_LAMBDA[@]}")" + echo "p3_prove_medians=$(join_slash "${RESULT_P3[@]}")" + echo "lambda_verify_medians=$(join_slash "${RESULT_LAMBDA_VERIFY[@]}")" + echo "p3_verify_medians=$(join_slash "${RESULT_P3_VERIFY[@]}")" + echo "lambda_proof_size_medians=$(join_slash "${RESULT_LAMBDA_PROOF_SIZE[@]}")" 
+ echo "p3_proof_size_medians=$(join_slash "${RESULT_P3_PROOF_SIZE[@]}")" + echo "lambda_peak_rss_medians=$(join_slash "${RESULT_LAMBDA_RSS[@]}")" + echo "p3_peak_rss_medians=$(join_slash "${RESULT_P3_RSS[@]}")" echo "ratios_lambda_over_p3=$(join_slash "${RESULT_RATIO[@]}")" } > "$REPORT_DIR/metrics.txt" fi diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs index cb58aea42..66d9baacd 100644 --- a/bench_vs_plonky3/src/bin/prove_bench.rs +++ b/bench_vs_plonky3/src/bin/prove_bench.rs @@ -1,17 +1,18 @@ //! Minimal wall-clock benchmark harness for Lambda STARK vs Plonky3. //! //! Builds the same Fibonacci AIR as `instruments_breakdown` (but without any -//! instrumentation) and prints a single line `Proving time: X.XXXs` to -//! stdout, suitable for parsing by `bench_vs_plonky3/run.sh`. +//! instrumentation) and prints human-readable timings plus one tab-separated +//! `METRICS` line, suitable for parsing by `bench_vs_plonky3/run.sh`. //! //! Usage: //! prove_bench --prover {lambda|p3} [--log-rows K] [--num-sequences N] -//! [--blowup B] [--queries Q] [--grinding G] +//! [--blowup B] [--queries Q] [--grinding G] [--breakdown] //! //! Defaults match production (`GoldilocksCubicProofOptions::with_blowup(2)`): //! log-rows=19, num-sequences=16, blowup=2, queries=219, grinding=0. 
use std::process::ExitCode; +use std::sync::{Arc, Mutex}; use std::time::Instant; use bench_vs_plonky3::{lambda_fibonacci_pair, plonky3_config, plonky3_fibonacci}; @@ -21,6 +22,8 @@ use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; use math::field::goldilocks::GoldilocksField; use stark::proof::options::ProofOptions; use stark::prover::{IsStarkProver, Prover}; +use stark::verifier::{IsStarkVerifier, Verifier}; +use tracing_subscriber::layer::SubscriberExt; type F = GoldilocksField; type E = Degree3GoldilocksExtensionField; @@ -39,6 +42,14 @@ struct Args { blowup: u8, queries: usize, grinding: u8, + breakdown: bool, +} + +struct BenchMetrics { + prove_s: f64, + verify_s: f64, + proof_size_bytes: usize, + peak_rss_kb: Option, } impl Default for Args { @@ -50,6 +61,7 @@ impl Default for Args { blowup: 2, queries: 219, grinding: 0, + breakdown: false, } } } @@ -58,7 +70,7 @@ fn print_usage() { eprintln!( "usage: prove_bench --prover {{lambda|p3}} \ [--log-rows K] [--num-sequences N] \ - [--blowup B] [--queries Q] [--grinding G]" + [--blowup B] [--queries Q] [--grinding G] [--breakdown]" ); } @@ -97,6 +109,9 @@ fn parse_args() -> Result { let v = iter.next().ok_or("--grinding needs a value")?; args.grinding = v.parse().map_err(|_| "--grinding: invalid u8")?; } + "--breakdown" => { + args.breakdown = true; + } "-h" | "--help" => { print_usage(); std::process::exit(0); @@ -113,6 +128,12 @@ fn parse_args() -> Result { if args.num_sequences == 0 { return Err("--num-sequences must be > 0".into()); } + if !args.blowup.is_power_of_two() { + return Err("--blowup must be a power of two".into()); + } + if args.queries == 0 { + return Err("--queries must be > 0".into()); + } Ok(args) } @@ -125,7 +146,282 @@ fn proof_options(args: &Args) -> ProofOptions { } } -fn run_lambda(args: &Args) -> std::time::Duration { +fn ms(seconds: f64) -> f64 { + seconds * 1000.0 +} + +fn print_breakdown( + prover: &str, + log_rows: u32, + rows: usize, + phase: &str, + 
elapsed_ms: f64, + extra: &str, +) { + println!( + "BREAKDOWN\tworkload=fib_pair\tprover={prover}\tlog_rows={log_rows}\trows={rows}\tphase={phase}\tms={elapsed_ms:.3}{extra}" + ); +} + +#[cfg(feature = "instruments")] +fn emit_lambda_breakdown(args: &Args, rows: usize, total_ms: f64) { + print_breakdown("lambda", args.log_rows, rows, "prove_total", total_ms, ""); + + if let Some(timing) = stark::instruments::take() { + print_breakdown( + "lambda", + args.log_rows, + rows, + "prepass", + ms(timing.prepass.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "main_commits", + ms(timing.main_commits.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "aux_build", + ms(timing.aux_build.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "aux_commit", + ms(timing.aux_commit.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "rounds_2_4", + ms(timing.rounds_2_4.as_secs_f64()), + "", + ); + + let r1 = timing.round1_sub; + print_breakdown( + "lambda", + args.log_rows, + rows, + "r1_main_lde", + ms(r1.main_lde.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r1_main_merkle", + ms(r1.main_merkle.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r1_aux_lde", + ms(r1.aux_lde.as_secs_f64()), + "", + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r1_aux_merkle", + ms(r1.aux_merkle.as_secs_f64()), + "", + ); + + for (name, table_rows, dur, sub) in timing.table_timings { + let extra = format!("\ttable={name}\ttable_rows={table_rows}"); + print_breakdown( + "lambda", + args.log_rows, + rows, + "table_total", + ms(dur.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r2_constraints", + ms(sub.constraints.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r2_comp_decompose", + 
ms(sub.comp_decompose.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r2_comp_commit", + ms(sub.comp_commit.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r3_ood", + ms(sub.ood.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r4_deep_comp", + ms(sub.deep_comp.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r4_deep_extend", + ms(sub.deep_extend.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r4_fri_commit", + ms(sub.fri_commit.as_secs_f64()), + &extra, + ); + print_breakdown( + "lambda", + args.log_rows, + rows, + "r4_queries", + ms(sub.queries.as_secs_f64()), + &extra, + ); + } + } +} + +#[cfg(not(feature = "instruments"))] +fn emit_lambda_breakdown(args: &Args, rows: usize, total_ms: f64) { + print_breakdown("lambda", args.log_rows, rows, "prove_total", total_ms, ""); + eprintln!("warning: Lambda phase breakdown requires building with --features instruments"); +} + +struct SpanState { + name: String, + active_since: Option, + accumulated: std::time::Duration, +} + +struct P3TimingLayer { + spans: Mutex>, + results: Arc>>, +} + +impl tracing_subscriber::registry::LookupSpan<'lookup>> + tracing_subscriber::Layer for P3TimingLayer +{ + fn on_new_span( + &self, + attrs: &tracing::span::Attributes<'_>, + id: &tracing::span::Id, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + self.spans.lock().unwrap().insert( + id.into_u64(), + SpanState { + name: attrs.metadata().name().to_string(), + active_since: None, + accumulated: std::time::Duration::ZERO, + }, + ); + } + + fn on_enter(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) + && entry.active_since.is_none() + { + entry.active_since = Some(Instant::now()); + } + } + + fn on_exit(&self, id: 
&tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) + && let Some(start) = entry.active_since.take() + { + entry.accumulated += start.elapsed(); + } + } + + fn on_close(&self, id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { + if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) { + let mut total = entry.accumulated; + if let Some(start) = entry.active_since { + total += start.elapsed(); + } + self.results + .lock() + .unwrap() + .push((entry.name, ms(total.as_secs_f64()))); + } + } +} + +type P3SpanResults = Arc>>; + +fn p3_span_subscriber() -> (impl tracing::Subscriber + Send + Sync, P3SpanResults) { + let results = Arc::new(Mutex::new(Vec::new())); + let layer = P3TimingLayer { + spans: Mutex::new(std::collections::HashMap::new()), + results: Arc::clone(&results), + }; + let filter = tracing_subscriber::filter::LevelFilter::DEBUG; + ( + tracing_subscriber::registry().with(filter).with(layer), + results, + ) +} + +fn peak_rss_kb() -> Option { + let mut usage = std::mem::MaybeUninit::::uninit(); + // SAFETY: getrusage initializes `usage` when it returns 0. 
+ let rc = unsafe { libc::getrusage(libc::RUSAGE_SELF, usage.as_mut_ptr()) }; + if rc != 0 { + return None; + } + + let maxrss = unsafe { usage.assume_init().ru_maxrss }; + #[cfg(target_os = "macos")] + { + Some((maxrss as u64).div_ceil(1024)) + } + #[cfg(not(target_os = "macos"))] + { + Some(maxrss as u64) + } +} + +fn run_lambda(args: &Args) -> BenchMetrics { let rows = 1usize << args.log_rows; let options = proof_options(args); @@ -148,21 +444,80 @@ fn run_lambda(args: &Args) -> std::time::Duration { &mut DefaultTranscript::::new(&[]), ) .expect("lambda prove failed"); - start.elapsed() + let prove_s = start.elapsed().as_secs_f64(); + if args.breakdown { + emit_lambda_breakdown(args, rows, ms(prove_s)); + } + + let proof_size_bytes = serde_cbor::to_vec(&_proof) + .expect("lambda proof serialization failed") + .len(); + + let start = Instant::now(); + let verified = + Verifier::::verify(&_proof, &air, &mut DefaultTranscript::::new(&[])); + let verify_s = start.elapsed().as_secs_f64(); + assert!(verified, "lambda verify failed"); + + BenchMetrics { + prove_s, + verify_s, + proof_size_bytes, + peak_rss_kb: peak_rss_kb(), + } } -fn run_p3(args: &Args) -> std::time::Duration { +fn run_p3(args: &Args) -> BenchMetrics { let rows = 1usize << args.log_rows; - let config = plonky3_config::matched_params_config(); + let config = plonky3_config::params_config(args.blowup, args.queries, args.grinding); let air = plonky3_fibonacci::P3FibonacciAir { num_sequences: args.num_sequences, }; let trace = plonky3_fibonacci::generate_fibonacci_trace(args.num_sequences, rows); let pis = plonky3_fibonacci::public_values(args.num_sequences); + let (prove_s, _proof, span_results) = if args.breakdown { + let (subscriber, results) = p3_span_subscriber(); + let start = Instant::now(); + let proof = { + let _guard = tracing::subscriber::set_default(subscriber); + p3_uni_stark::prove(&config, &air, trace, &pis) + }; + (start.elapsed().as_secs_f64(), proof, Some(results)) + } else { + let start 
= Instant::now(); + let proof = p3_uni_stark::prove(&config, &air, trace, &pis); + (start.elapsed().as_secs_f64(), proof, None) + }; + + if args.breakdown { + print_breakdown("p3", args.log_rows, rows, "prove_total", ms(prove_s), ""); + if let Some(results) = span_results { + let mut span_data = results.lock().unwrap().clone(); + span_data.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + for (name, elapsed_ms) in span_data { + if elapsed_ms >= 0.1 { + let extra = format!("\tspan={name}"); + print_breakdown("p3", args.log_rows, rows, "span", elapsed_ms, &extra); + } + } + } + } + + let proof_size_bytes = serde_cbor::to_vec(&_proof) + .expect("p3 proof serialization failed") + .len(); + let start = Instant::now(); - let _proof = p3_uni_stark::prove(&config, &air, trace, &pis); - start.elapsed() + p3_uni_stark::verify(&config, &air, &_proof, &pis).expect("p3 verify failed"); + let verify_s = start.elapsed().as_secs_f64(); + + BenchMetrics { + prove_s, + verify_s, + proof_size_bytes, + peak_rss_kb: peak_rss_kb(), + } } fn main() -> ExitCode { @@ -175,11 +530,46 @@ fn main() -> ExitCode { } }; - let elapsed = match args.prover { + let metrics = match args.prover { ProverKind::Lambda => run_lambda(&args), ProverKind::P3 => run_p3(&args), }; - println!("Proving time: {:.3}s", elapsed.as_secs_f64()); + let prover_name = match args.prover { + ProverKind::Lambda => "lambda", + ProverKind::P3 => "p3", + }; + let rows = 1usize << args.log_rows; + let main_cols = 2 * args.num_sequences; + let aux_cols = 0usize; + let cells = rows * main_cols; + let rows_per_sec = rows as f64 / metrics.prove_s; + let cells_per_sec = cells as f64 / metrics.prove_s; + let peak_rss_kb = metrics + .peak_rss_kb + .map(|v| v.to_string()) + .unwrap_or_else(|| "n/a".to_string()); + + println!("Proving time: {:.6}s", metrics.prove_s); + println!("Verification time: {:.6}s", metrics.verify_s); + println!("Proof size: {} bytes", metrics.proof_size_bytes); + println!("Peak RSS: {peak_rss_kb} KB"); + 
println!( + "METRICS\tworkload=fib_pair\tprover={prover_name}\tlog_rows={}\trows={rows}\t\ + num_sequences={}\tmain_cols={main_cols}\taux_cols={aux_cols}\ttables=1\t\ + logup=false\tblowup={}\tfri_queries={}\tgrinding={}\tprove_s={:.6}\t\ + verify_s={:.6}\tproof_size_bytes={}\tpeak_rss_kb={peak_rss_kb}\t\ + rows_per_sec={:.3}\tcells_per_sec={:.3}", + args.log_rows, + args.num_sequences, + args.blowup, + args.queries, + args.grinding, + metrics.prove_s, + metrics.verify_s, + metrics.proof_size_bytes, + rows_per_sec, + cells_per_sec, + ); ExitCode::SUCCESS } diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs index 751e86855..9c1ca6024 100644 --- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs +++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs @@ -180,7 +180,8 @@ where } /// Public inputs: initial `(a, b) = (left, right)` pair for each sequence. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] +#[serde(bound = "")] pub struct FibonacciPairPublicInputs { pub initial_values: Vec<(FieldElement, FieldElement)>, } @@ -209,6 +210,10 @@ where 1 } + fn name(&self) -> &str { + "fib_pair" + } + fn new(proof_options: &ProofOptions) -> Self { Self::with_num_sequences(proof_options, 2) } diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs index 971660f37..a266cac04 100644 --- a/bench_vs_plonky3/src/plonky3_config.rs +++ b/bench_vs_plonky3/src/plonky3_config.rs @@ -45,22 +45,25 @@ fn build_mmcs() -> (ValMmcs, ChallengeMmcs, ByteHash) { (val_mmcs, challenge_mmcs, byte_hash) } -/// Creates a Plonky3 STARK config with parameters matched to Lambda's -/// production config `GoldilocksCubicProofOptions::with_blowup(2)`: -/// blowup=2, 219 FRI queries, grinding=0 (excluded from benchmark). -pub fn matched_params_config() -> P3Config { +/// Creates a Plonky3 STARK config with parameters matched to Lambda's proof +/// options. 
`blowup` must be a power of two because Plonky3 stores it as +/// `log_blowup`. +pub fn params_config(blowup: u8, queries: usize, grinding: u8) -> P3Config { + assert!( + blowup.is_power_of_two(), + "blowup must be a power of two for Plonky3" + ); + let (val_mmcs, challenge_mmcs, byte_hash) = build_mmcs(); let dft = Dft::default(); let challenger = Challenger::from_hasher(vec![], byte_hash); - // Match Lambda production: blowup=2, queries=219, grinding=0. - // Grinding excluded from benchmark (identical PoW on both sides). let fri_params = FriParameters { - log_blowup: 1, // blowup = 2 + log_blowup: blowup.trailing_zeros() as usize, log_final_poly_len: 0, max_log_arity: 1, - num_queries: 219, - commit_proof_of_work_bits: 0, + num_queries: queries, + commit_proof_of_work_bits: grinding as usize, query_proof_of_work_bits: 0, mmcs: challenge_mmcs, }; @@ -68,3 +71,10 @@ pub fn matched_params_config() -> P3Config { let pcs = Pcs::new(dft, val_mmcs, fri_params); P3Config::new(pcs, challenger) } + +/// Creates a Plonky3 STARK config with parameters matched to Lambda's +/// production config `GoldilocksCubicProofOptions::with_blowup(2)`: +/// blowup=2, 219 FRI queries, grinding=0. 
+pub fn matched_params_config() -> P3Config { + params_config(2, 219, 0) +} From 6b57545b88ef461b0f37dd6422a920bcd40b2b91 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 12 May 2026 18:34:45 -0300 Subject: [PATCH 28/34] Add Lambda-vs-Plonky3 sections to the nightly Slack post --- .github/scripts/publish_bench_vs.sh | 121 +++++++++++++++++++++- .github/workflows/bench-vs-nightly.yml | 11 ++ .github/workflows/bench-vs-p3-nightly.yml | 47 --------- bench_vs_plonky3/Cargo.toml | 38 +++---- bench_vs_plonky3/run_p3_nightly.sh | 51 +++++++++ bench_vs_plonky3/src/plonky3_config.rs | 8 +- 6 files changed, 198 insertions(+), 78 deletions(-) delete mode 100644 .github/workflows/bench-vs-p3-nightly.yml create mode 100755 bench_vs_plonky3/run_p3_nightly.sh diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh index 4408c17c0..f30dcce52 100644 --- a/.github/scripts/publish_bench_vs.sh +++ b/.github/scripts/publish_bench_vs.sh @@ -79,6 +79,125 @@ if [ -n "$LAMBDA_PROJECTED_H" ] || [ -n "$SP1_PROJECTED_H" ]; then PROJ_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Linear Projection"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$PROJ_MRKDWN"'"}}' fi +# --- Plonky3 section (optional) -------------------------------------------- +# Built when `bench_vs_artifacts/p3/headline/metrics.txt` exists. The headline +# row comes from that file; column-scaling rows are read from the per-N +# subdirs written by the workflow. 
+ +p3_parse() { + local file=$1 + local key=$2 + { grep "^${key}=" "$file" 2>/dev/null || true; } | cut -d= -f2- +} + +p3_fmt_seconds() { + LC_NUMERIC=C awk -v s="$1" 'BEGIN { + if (s == "") { print "n/a"; exit } + if (s + 0 < 1) printf "%.0fms", s * 1000 + else printf "%.3fs", s + }' +} + +p3_fmt_mb() { + LC_NUMERIC=C awk -v b="$1" 'BEGIN { + if (b == "") { print "n/a"; exit } + printf "%.1f MB", b / (1024 * 1024) + }' +} + +p3_fmt_gb() { + LC_NUMERIC=C awk -v kb="$1" 'BEGIN { + if (kb == "") { print "n/a"; exit } + printf "%.2f GB", kb / (1024 * 1024) + }' +} + +p3_fmt_ratio_pair() { + LC_NUMERIC=C awk -v a="$1" -v b="$2" 'BEGIN { + if (a == "" || b == "" || b + 0 == 0) { print "n/a"; exit } + printf "%.2fx", a / b + }' +} + +P3_SECTION="" +P3_HEADLINE_FILE="bench_vs_artifacts/p3/headline/metrics.txt" +if [ -f "$P3_HEADLINE_FILE" ]; then + H_LOG_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "log_rows_series") + H_COLS=$(p3_parse "$P3_HEADLINE_FILE" "columns") + H_BLOWUP=$(p3_parse "$P3_HEADLINE_FILE" "blowup") + H_QUERIES=$(p3_parse "$P3_HEADLINE_FILE" "fri_queries") + H_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "rows_series") + H_LAMBDA_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "lambda_prove_medians") + H_P3_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "p3_prove_medians") + H_LAMBDA_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "lambda_verify_medians") + H_P3_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "p3_verify_medians") + H_LAMBDA_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "lambda_proof_size_medians") + H_P3_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "p3_proof_size_medians") + H_LAMBDA_RSS=$(p3_parse "$P3_HEADLINE_FILE" "lambda_peak_rss_medians") + H_P3_RSS=$(p3_parse "$P3_HEADLINE_FILE" "p3_peak_rss_medians") + H_RATIO=$(p3_parse "$P3_HEADLINE_FILE" "ratios_lambda_over_p3") + + H_ROWS_FMT=$(LC_NUMERIC=C awk -v r="$H_ROWS" 'BEGIN { + if (r == "") { print "n/a"; exit } + if (r + 0 >= 1000000) printf "%.1fM", r / 1000000 + else if (r + 0 >= 1000) printf "%.0fK", r / 1000 + else printf "%d", r + }') + + 
PROOF_RATIO=$(p3_fmt_ratio_pair "$H_LAMBDA_PROOF" "$H_P3_PROOF") + RSS_RATIO=$(p3_fmt_ratio_pair "$H_LAMBDA_RSS" "$H_P3_RSS") + PROVE_RATIO_FMT=$(LC_NUMERIC=C awk -v r="$H_RATIO" 'BEGIN { + if (r == "" || r == "n/a") { print "n/a"; exit } + printf "%.2fx", r + }') + + P3_HEADLINE_MRKDWN="*log_rows=${H_LOG_ROWS} (${H_ROWS_FMT} rows · ${H_COLS} cols · blowup=${H_BLOWUP} · ${H_QUERIES} queries)*" + P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Lambda:* $(p3_fmt_seconds "$H_LAMBDA_PROVE") prove · $(p3_fmt_seconds "$H_LAMBDA_VERIFY") verify · $(p3_fmt_mb "$H_LAMBDA_PROOF") proof · $(p3_fmt_gb "$H_LAMBDA_RSS") RSS" + P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS" + P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS" + + # Render a `(label|file)` list into a multi-line mrkdwn block with + # `*label* Lambda Xs / P3 Ys — Rx` per row. Used by both sweep sections. + p3_render_sweep() { + local out="" + local entry label file lambda_t p3_t ratio ratio_fmt line + for entry in "$@"; do + label="${entry%%|*}" + file="${entry##*|}" + if [ ! 
-f "$file" ]; then + line="*${label}* (no data)" + else + lambda_t=$(p3_parse "$file" "lambda_prove_medians") + p3_t=$(p3_parse "$file" "p3_prove_medians") + ratio=$(p3_parse "$file" "ratios_lambda_over_p3") + ratio_fmt=$(LC_NUMERIC=C awk -v r="$ratio" 'BEGIN { + if (r == "" || r == "n/a") { print "n/a"; exit } + printf "%.2fx", r + }') + line="*${label}* Lambda $(p3_fmt_seconds "$lambda_t") / P3 $(p3_fmt_seconds "$p3_t") — ${ratio_fmt}" + fi + if [ -n "$out" ]; then + out="${out}\\n${line}" + else + out="$line" + fi + done + printf '%s' "$out" + } + + P3_SIZE_MRKDWN=$(p3_render_sweep \ + "log_rows=19|bench_vs_artifacts/p3/size_log19/metrics.txt" \ + "log_rows=20|bench_vs_artifacts/p3/size_log20/metrics.txt" \ + "log_rows=21|bench_vs_artifacts/p3/headline/metrics.txt") + + P3_COLS_MRKDWN=$(p3_render_sweep \ + "8 cols (n=4):|bench_vs_artifacts/p3/cols_n4/metrics.txt" \ + "32 cols (n=16):|bench_vs_artifacts/p3/headline/metrics.txt" \ + "128 cols (n=64):|bench_vs_artifacts/p3/cols_n64/metrics.txt") + + P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Size scaling @ 32 cols"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_SIZE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Column scaling @ log_rows='"$H_LOG_ROWS"'"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_COLS_MRKDWN"'"}}' +fi + curl -X POST "$WEBHOOK_URL" \ -H 'Content-Type: application/json; charset=utf-8' \ - --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"divider"},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION"']}' + --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly 
Benchmark"}},{"type":"divider"},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION$P3_SECTION"']}' diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml index 2118632f8..f27cdf356 100644 --- a/.github/workflows/bench-vs-nightly.yml +++ b/.github/workflows/bench-vs-nightly.yml @@ -47,6 +47,17 @@ jobs: --report-dir bench_vs_artifacts \ --no-color + - name: Refresh Plonky3 to latest main + run: | + cargo update --manifest-path bench_vs_plonky3/Cargo.toml \ + -p p3-air -p p3-field -p p3-goldilocks -p p3-matrix \ + -p p3-commit -p p3-challenger -p p3-symmetric \ + -p p3-merkle-tree -p p3-keccak -p p3-fri \ + -p p3-uni-stark -p p3-dft + + - name: Run Plonky3 nightly benchmark + run: bash ./bench_vs_plonky3/run_p3_nightly.sh bench_vs_artifacts/p3 + - name: Upload nightly benchmark artifact uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/bench-vs-p3-nightly.yml b/.github/workflows/bench-vs-p3-nightly.yml deleted file mode 100644 index 03fedad2b..000000000 --- a/.github/workflows/bench-vs-p3-nightly.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Bench Vs Plonky3 Nightly - -on: - schedule: - # 04:30 America/Argentina/Buenos_Aires = 07:30 UTC - # SP1 nightly fires at 06:00 UTC (03:00 BA) and runs ~1.5h; scheduling 1.5h - # later leaves the self-hosted bench runner free. 
- - cron: "30 7 * * *" - workflow_dispatch: - -permissions: - contents: read - -concurrency: - group: bench-vs-p3-nightly-${{ github.ref }} - cancel-in-progress: true - -jobs: - bench-vs-p3: - runs-on: [self-hosted, bench] - timeout-minutes: 60 - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Rust Environment - uses: ./.github/actions/setup-rust - - - name: Add cargo to PATH - run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - - - name: Run nightly Plonky3 benchmark - run: | - bash ./bench_vs_plonky3/run.sh \ - --log-rows 19 \ - --num-sequences 16 \ - --runs 10 \ - --scalar \ - --report-dir bench_vs_p3_artifacts \ - --no-color - - - name: Upload nightly benchmark artifact - uses: actions/upload-artifact@v4 - with: - name: bench-vs-p3-nightly-${{ github.run_number }}-${{ github.sha }} - path: bench_vs_p3_artifacts - retention-days: 90 diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index 92deaa31c..8fef10667 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -12,28 +12,18 @@ math = { path = "../crypto/math", features = [ "lambdaworks-serde-binary", ] } -# Plonky3: pinned to the yetanotherco fork, branch `feat/goldilocks_deg3`. -# The branch adds BinomiallyExtendable<3> for Goldilocks (x^3 - 2), matching -# Lambda's Degree3GoldilocksExtensionField. All p3-* crates MUST resolve to -# the same git source + ref; declaring any of them as a crates.io dep would -# pull in a second incompatible p3-field. cargo clones the fork once into -# ~/.cargo/git/db and Cargo.lock pins the branch HEAD at fetch time. 
-p3-air = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-field = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-goldilocks = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-matrix = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-commit = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-challenger = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-symmetric = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-merkle-tree = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-keccak = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-fri = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3" } -p3-uni-stark = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ - "parallel", -] } -p3-dft = { git = "https://github.com/yetanotherco/Plonky3.git", branch = "feat/goldilocks_deg3", features = [ - "parallel", -] } +p3-air = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-field = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-commit = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-fri = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-uni-stark = { git = 
"https://github.com/Plonky3/Plonky3.git", features = ["parallel"] } +p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] } # Tracing for P3 span-based profiling tracing = "0.1" @@ -49,8 +39,8 @@ criterion = { version = "0.4", default-features = false } # Both provers run multi-threaded by default: Plonky3's `Radix2DitParallel` DFT # uses rayon unconditionally, so Lambda must also enable `parallel` for a fair # apples-to-apples comparison. Disable with `--no-default-features` to compare -# single-threaded. Cubic extension (`x^3 - 2`) matching Lambda is unconditional -# — the fork ships `BinomiallyExtendable<3>` for Goldilocks natively. +# single-threaded. The cubic extension is `x^3 - 2` (binomial) on Lambda and +# `x^3 - x - 1` (trinomial) on upstream Plonky3 — same degree, same soundness. default = ["parallel"] parallel = ["stark/parallel"] instruments = ["stark/instruments"] diff --git a/bench_vs_plonky3/run_p3_nightly.sh b/bench_vs_plonky3/run_p3_nightly.sh new file mode 100755 index 000000000..495b0ed58 --- /dev/null +++ b/bench_vs_plonky3/run_p3_nightly.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Orchestrates the Lambda-vs-Plonky3 nightly benchmark. +# +# Runs 5 configurations of run.sh into separate report-dirs under +# `$REPORT_BASE`. The same 5 dirs are consumed by +# `.github/scripts/publish_bench_vs.sh` to render the 3-section Slack post +# (Headline + Size scaling + Column scaling). +# +# Usage: +# ./bench_vs_plonky3/run_p3_nightly.sh [REPORT_BASE] +# +# Defaults: REPORT_BASE=bench_vs_artifacts/p3 +# +# Each run is 10 iterations × 2 provers; the 5 runs together take ~3 min on +# the bench server. + +set -euo pipefail + +REPORT_BASE="${1:-bench_vs_artifacts/p3}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RUN_SH="$SCRIPT_DIR/run.sh" + +if [ ! 
-x "$RUN_SH" ]; then + echo "run.sh not found or not executable at $RUN_SH" >&2 + exit 1 +fi + +run_one() { + local label=$1 + local log_rows=$2 + local num_sequences=$3 + local out_dir="$REPORT_BASE/$label" + echo + echo "=== ${label} (log_rows=${log_rows}, num_sequences=${num_sequences}) ===" + bash "$RUN_SH" \ + --log-rows "$log_rows" \ + --num-sequences "$num_sequences" \ + --runs 10 \ + --scalar \ + --report-dir "$out_dir" \ + --no-color +} + +# Size sweep + headline (32 cols). +run_one size_log19 19 16 +run_one size_log20 20 16 +run_one headline 21 16 + +# Column sweep @ log_rows=21. +run_one cols_n4 21 4 +run_one cols_n64 21 64 diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs index a266cac04..774009bba 100644 --- a/bench_vs_plonky3/src/plonky3_config.rs +++ b/bench_vs_plonky3/src/plonky3_config.rs @@ -1,7 +1,7 @@ use p3_challenger::{HashChallenger, SerializingChallenger64}; use p3_commit::ExtensionMmcs; use p3_dft::Radix2DitParallel; -use p3_field::extension::BinomialExtensionField; +use p3_field::extension::CubicTrinomialExtensionField; use p3_fri::{FriParameters, TwoAdicFriPcs}; use p3_goldilocks::Goldilocks; use p3_keccak::{Keccak256Hash, KeccakF}; @@ -10,11 +10,7 @@ use p3_symmetric::{CompressionFunctionFromHasher, PaddingFreeSponge, Serializing use p3_uni_stark::StarkConfig; pub type Val = Goldilocks; - -/// Cubic extension matching Lambda's `Degree3GoldilocksExtensionField` -/// (irreducible x^3 - 2). Provided by the forked `p3-goldilocks` via -/// `BinomiallyExtendable<3>`. 
-pub type Challenge = BinomialExtensionField; +pub type Challenge = CubicTrinomialExtensionField; type ByteHash = Keccak256Hash; type U64Hash = PaddingFreeSponge; From 82db83b0e760ba82bfd98bab30d39d93ba1c1af1 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Mon, 18 May 2026 16:21:49 -0300 Subject: [PATCH 29/34] Trim Lambda-vs-Plonky3 nightly to headline-only (drop size and column sweeps) --- .github/scripts/publish_bench_vs.sh | 45 ++--------------------------- bench_vs_plonky3/run_p3_nightly.sh | 20 ++++--------- 2 files changed, 7 insertions(+), 58 deletions(-) diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh index f30dcce52..5e6128bfb 100644 --- a/.github/scripts/publish_bench_vs.sh +++ b/.github/scripts/publish_bench_vs.sh @@ -80,9 +80,7 @@ if [ -n "$LAMBDA_PROJECTED_H" ] || [ -n "$SP1_PROJECTED_H" ]; then fi # --- Plonky3 section (optional) -------------------------------------------- -# Built when `bench_vs_artifacts/p3/headline/metrics.txt` exists. The headline -# row comes from that file; column-scaling rows are read from the per-N -# subdirs written by the workflow. +# Built when `bench_vs_artifacts/p3/headline/metrics.txt` exists. p3_parse() { local file=$1 @@ -156,46 +154,7 @@ if [ -f "$P3_HEADLINE_FILE" ]; then P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS" P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS" - # Render a `(label|file)` list into a multi-line mrkdwn block with - # `*label* Lambda Xs / P3 Ys — Rx` per row. Used by both sweep sections. - p3_render_sweep() { - local out="" - local entry label file lambda_t p3_t ratio ratio_fmt line - for entry in "$@"; do - label="${entry%%|*}" - file="${entry##*|}" - if [ ! 
-f "$file" ]; then - line="*${label}* (no data)" - else - lambda_t=$(p3_parse "$file" "lambda_prove_medians") - p3_t=$(p3_parse "$file" "p3_prove_medians") - ratio=$(p3_parse "$file" "ratios_lambda_over_p3") - ratio_fmt=$(LC_NUMERIC=C awk -v r="$ratio" 'BEGIN { - if (r == "" || r == "n/a") { print "n/a"; exit } - printf "%.2fx", r - }') - line="*${label}* Lambda $(p3_fmt_seconds "$lambda_t") / P3 $(p3_fmt_seconds "$p3_t") — ${ratio_fmt}" - fi - if [ -n "$out" ]; then - out="${out}\\n${line}" - else - out="$line" - fi - done - printf '%s' "$out" - } - - P3_SIZE_MRKDWN=$(p3_render_sweep \ - "log_rows=19|bench_vs_artifacts/p3/size_log19/metrics.txt" \ - "log_rows=20|bench_vs_artifacts/p3/size_log20/metrics.txt" \ - "log_rows=21|bench_vs_artifacts/p3/headline/metrics.txt") - - P3_COLS_MRKDWN=$(p3_render_sweep \ - "8 cols (n=4):|bench_vs_artifacts/p3/cols_n4/metrics.txt" \ - "32 cols (n=16):|bench_vs_artifacts/p3/headline/metrics.txt" \ - "128 cols (n=64):|bench_vs_artifacts/p3/cols_n64/metrics.txt") - - P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Size scaling @ 32 cols"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_SIZE_MRKDWN"'"}},{"type":"header","text":{"type":"plain_text","text":"Column scaling @ log_rows='"$H_LOG_ROWS"'"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_COLS_MRKDWN"'"}}' + P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}}' fi curl -X POST "$WEBHOOK_URL" \ diff --git a/bench_vs_plonky3/run_p3_nightly.sh b/bench_vs_plonky3/run_p3_nightly.sh index 495b0ed58..47957b6ff 100755 --- a/bench_vs_plonky3/run_p3_nightly.sh +++ b/bench_vs_plonky3/run_p3_nightly.sh @@ -1,18 +1,15 @@ 
#!/usr/bin/env bash # Orchestrates the Lambda-vs-Plonky3 nightly benchmark. # -# Runs 5 configurations of run.sh into separate report-dirs under -# `$REPORT_BASE`. The same 5 dirs are consumed by -# `.github/scripts/publish_bench_vs.sh` to render the 3-section Slack post -# (Headline + Size scaling + Column scaling). +# Runs the headline configuration (log_rows=21, num_sequences=16 → 32 cols) +# into `$REPORT_BASE/headline/`. Consumed by +# `.github/scripts/publish_bench_vs.sh` to render the Headline section of +# the Slack post. # # Usage: # ./bench_vs_plonky3/run_p3_nightly.sh [REPORT_BASE] # # Defaults: REPORT_BASE=bench_vs_artifacts/p3 -# -# Each run is 10 iterations × 2 provers; the 5 runs together take ~3 min on -# the bench server. set -euo pipefail @@ -41,11 +38,4 @@ run_one() { --no-color } -# Size sweep + headline (32 cols). -run_one size_log19 19 16 -run_one size_log20 20 16 -run_one headline 21 16 - -# Column sweep @ log_rows=21. -run_one cols_n4 21 4 -run_one cols_n64 21 64 +run_one headline 21 16 From c84a12cce896d5c062a03b24ceab9e65e98a6cfc Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 19 May 2026 10:06:51 -0300 Subject: [PATCH 30/34] fix bench --- bench_vs_plonky3/Cargo.toml | 27 +++--- bench_vs_plonky3/README.md | 84 +++++++++++++++---- bench_vs_plonky3/run.sh | 48 +++++++++-- bench_vs_plonky3/src/bin/prove_bench.rs | 56 +++++++++++++ bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 38 +++++++++ bench_vs_plonky3/src/plonky3_config.rs | 38 +++++---- 6 files changed, 237 insertions(+), 54 deletions(-) diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index 8fef10667..26bb49cc1 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -12,18 +12,21 @@ math = { path = "../crypto/math", features = [ "lambdaworks-serde-binary", ] } -p3-air = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-field = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-goldilocks = { git = 
"https://github.com/Plonky3/Plonky3.git" } -p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-commit = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-fri = { git = "https://github.com/Plonky3/Plonky3.git" } -p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] } -p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] } +# Pinned to the commit currently resolved in Cargo.lock so the benchmark is +# reproducible against an exact P3 revision. Bumping is fine; it must be an +# explicit decision, not the result of an unrelated `cargo update`. +p3-air = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-field = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-commit = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-fri = { git = "https://github.com/Plonky3/Plonky3.git", 
rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } +p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] } +p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] } # Tracing for P3 span-based profiling tracing = "0.1" diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md index 066582280..a78249f31 100644 --- a/bench_vs_plonky3/README.md +++ b/bench_vs_plonky3/README.md @@ -35,7 +35,7 @@ test. ## Usage ```bash -# Default: log-rows=19, num-sequences=16, runs=3, cubic extension, no scalar +# Default: log-rows=19, num-sequences=16, runs=10, cubic extension, no scalar ./bench_vs_plonky3/run.sh # Size sweep @@ -58,10 +58,10 @@ test. |---|---|---| | `--log-rows K [K ...]` | `19` | One or more power-of-2 row counts. | | `--num-sequences N` | `16` | Number of Fibonacci sequences (columns = `2 × N`). | -| `--runs N` | `3` | Runs per `(size, prover)`; median is reported. | +| `--runs N` | `10` | Runs per `(size, prover)`; median + CV are reported. | | `--lambda-only` / `--p3-only` | both | Restrict to a single prover. | -| `--report-dir DIR` | — | Write TSV + metrics + raw stdouts. | -| `--scalar` | off | Pin `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` so Goldilocks (and most of Keccak) run scalar on both sides. x86_64 only; on other archs the flag is ignored with a warning. Residual SSE2 on `p3-keccak` remains (~7% of total prove time). | +| `--report-dir DIR` | — | Write TSV + metrics + raw stdouts + raw audits. | +| `--scalar` | off | Pin `RUSTFLAGS="-C target-feature=-avx2,-avx512f"` so Goldilocks field arithmetic runs scalar on both sides. x86_64 only; on other archs the flag is ignored with a warning. The MMCS is already scalar regardless of this flag (see [P3 config: scalar MMCS](#p3-config-scalar-mmcs)). | | `--no-color` | off | Disable ANSI colors. | | `-h` / `--help` | — | Print usage. 
| @@ -73,8 +73,10 @@ Stdout (without `--report-dir`): === STARK prove benchmark: Lambda vs Plonky3 === log-rows: 19 num-sequences: 16 (columns = 32) - runs/size: 3 (median reported) - p3 extension: degree 3 (forked p3-goldilocks, matches Lambda) + runs/size: 10 (median + CV reported) + p3 extension: upstream CubicTrinomialExtensionField (x^3 - x - 1) + p3 mmcs: scalar Keccak256 (val_packing_width=1, hash_lanes=1) + proof params: blowup=2, queries=219, grinding=0 scalar mode: on (arch=x86_64, RUSTFLAGS="-C target-feature=-avx2,-avx512f") [build] prove_bench @@ -95,9 +97,16 @@ With `--report-dir DIR` the script writes: - `results.tsv` — tab-separated raw data (`log_rows, rows, lambda_median_s, p3_median_s, ratio_lambda_over_p3, runs`). +- `raw_metrics.tsv` — one row per `(prover, log_rows, run)` with all + `METRICS` fields parsed out. +- `raw_audits.tsv` — one row per `(prover, log_rows, run)` with the AUDIT + line emitted by `prove_bench` before each prove call. Lets you confirm in + retrospect that `val_packing_width=1`, `hash_lanes=1`, + `base_transition_constraints=2×num_sequences`, etc. Don't trust a number + without skimming this file. - `metrics.txt` — key=value pairs with the config used (arch, scalar flag, - extension degree, blowup, queries, runs, rustflags) and the per-series - values slash-joined (so post-processing scripts can split easily). + extension, mmcs choice, blowup, queries, runs, rustflags) and the + per-series values slash-joined (so post-processing scripts can split easily). - `raw/` — per-invocation stdouts (`{prover}_log{K}_run{i}.stdout`). No markdown file is generated — the TSV is the single source of truth for @@ -156,18 +165,59 @@ cargo test -p bench-vs-plonky3 --features instruments --release -- \ The nightly does **not** activate this path — it would add ~1 % overhead and pollute the historical wall-clock numbers. 
+## P3 config: scalar MMCS + +`plonky3_config.rs` sets up the P3 stark config with a deliberately +**non-production** MMCS: + +```rust +type ByteHash = Keccak256Hash; // tiny_keccak scalar +type FieldHash = SerializingHasher; +type MyCompress = CompressionFunctionFromHasher; +pub type ValMmcs = MerkleTreeMmcs; +``` + +The Plonky3 default for Goldilocks MMCS uses `PaddingFreeSponge` with leaves `[Val; VECTOR_LEN]` and digests `[u64; VECTOR_LEN]`, +where `VECTOR_LEN` is set at compile-time per arch: NEON=2, AVX-512=8, +AVX2=4, SSE2=2, fallback=1. That gives Plonky3 a free `N×` Keccak speedup +on every Merkle node — which Lambda's `sha3::Keccak256` cannot exploit +because the Lambda MMCS hashes a single input at a time. + +The scalar config here makes both sides hash one input per Keccak call. +Both still use the **same Keccak-f[1600] permutation** (capacity 512, rate +1088, 256-bit output, Keccak-original 0x01 padding); the only thing +removed is data-parallel lanes on the P3 side. Consequence: the ratio +published by this bench is **apples-to-apples scalar**, not "Plonky3 as +shipped in production." If you want the production-realistic P3 number, +swap the MMCS back to the vector-lane variant from upstream's examples. + +On aarch64 with `feature="asm"` enabled in `crypto/crypto`, Lambda's +`sha3::Keccak256` uses ARMv8 SHA3 intrinsics, which speeds up *one* Keccak +call (no data parallelism). `tiny_keccak`'s `Keccak256Hash` on P3 is pure +Rust and gets no such acceleration. On x86_64 server, neither side has +that path, so the comparison is cleanest there. + ## Notes on fairness -- **Extension field**: Plonky3 runs `BinomialExtensionField` - with the same `x^3 - 2` irreducible as Lambda's - `Degree3GoldilocksExtensionField`. Both sides use the same cubic extension. +- **Extension field**: Plonky3 runs upstream `CubicTrinomialExtensionField` + over Goldilocks (`x^3 - x - 1`); Lambda runs `Degree3GoldilocksExtensionField` + (`x^3 - 2`). 
Both are degree-3 irreducible extensions of `GF(p)` with the + same field size and the same soundness. Cell-by-cell trace equivalence is + asserted by `lambda_pair_trace_matches_plonky3_trace`. - **Parallelism**: both provers are multi-threaded by default. Lambda pulls - rayon via `stark/parallel`; Plonky3 pulls rayon via - `p3-uni-stark` / `p3-dft` (hardcoded `features = ["parallel"]`, always on). -- **SIMD**: without `--scalar`, each side uses whatever target-features the - compiler decides from the host CPU. `--scalar` (x86_64 only) disables AVX2 - and AVX-512 so Goldilocks arithmetic is scalar on both sides. `p3-keccak`'s - SSE2 path on x86 is not disabled. + rayon via `stark/parallel`; Plonky3 pulls rayon via `p3-uni-stark` / + `p3-dft` (hardcoded `features = ["parallel"]`, always on). +- **SIMD**: the MMCS Keccak is scalar on both sides (see above). For + Goldilocks field arithmetic, without `--scalar` each side uses whatever + target-features the compiler decides from the host CPU. `--scalar` + (x86_64 only) disables AVX2 / AVX-512. +- **AIR base-field path**: the Lambda AIR overrides + `num_base_transition_constraints` and implements `evaluate_prover` so its + Fibonacci transition constraints are evaluated in the base field (F×E, + ≈3 muls/term) instead of the default extension path (E×E, ≈9 muls/term). + This matches what the production Lambda STARK does for all + domain-constraint AIRs. - **Queries / grinding**: same `blowup=2`, `queries=219`, `grinding=0` on both sides. Security models differ (Lambda: Johnson-bound, ~108 bits proven; P3: conjectured, 219 queries × 1 bit = 219 bits, capped at 192 by the diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index 0098fed33..20f1f6331 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -10,9 +10,10 @@ # Defaults: --log-rows 19, --num-sequences 16, --runs 10. # With multiple --log-rows values, prints one stats row per size. 
# -# --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks (and most of Keccak) -# run scalar; residual SSE2 in p3-keccak remains. Triggers a rebuild when -# toggling; subsequent runs with the same RUSTFLAGS are cached. +# --scalar: on x86_64 drops AVX2 / AVX-512 so Goldilocks runs scalar. The MMCS +# itself is already scalar (single-input tiny_keccak via Keccak256Hash) regardless +# of this flag — its SIMD lanes were removed in the config. Triggers a rebuild +# when toggling; subsequent runs with the same RUSTFLAGS are cached. set -euo pipefail @@ -123,10 +124,9 @@ if [ -n "$REPORT_DIR" ]; then fi # --- Scalar (no SIMD) toggle ------------------------------------------------ -# When --scalar is on, disable AVX2/AVX-512 so Goldilocks (and most of Keccak) -# run scalar for an apples-to-apples comparison against Lambda STARK. The -# residual SSE2 path on p3-keccak is intentionally left enabled — its -# contribution to total prove time is ~7%. +# When --scalar is on, disable AVX2/AVX-512 so Goldilocks field arithmetic runs +# scalar for an apples-to-apples comparison against Lambda STARK. The MMCS Keccak +# is already scalar regardless of this flag (see plonky3_config.rs). # Cargo caches per-RUSTFLAGS, so toggling scalar vs vector triggers a rebuild # on first use but is cached afterwards. 
SCALAR_RUSTFLAGS="" @@ -155,7 +155,8 @@ echo -e "${BOLD}=== STARK prove benchmark: Lambda vs Plonky3 ===${NC}" echo -e " log-rows: ${YELLOW}${LOG_ROWS[*]}${NC}" echo -e " num-sequences: ${YELLOW}${NUM_SEQUENCES}${NC} (columns = $((2 * NUM_SEQUENCES)))" echo -e " runs/size: ${YELLOW}${RUNS}${NC} (median + CV reported)" -echo -e " p3 extension: ${YELLOW}degree 3 (forked p3-goldilocks, matches Lambda)${NC}" +echo -e " p3 extension: ${YELLOW}upstream CubicTrinomialExtensionField (x^3 - x - 1)${NC}" +echo -e " p3 mmcs: ${YELLOW}scalar Keccak256 (val_packing_width=1, hash_lanes=1)${NC}" echo -e " proof params: ${YELLOW}blowup=${BLOWUP}, queries=${FRI_QUERIES}, grinding=${GRINDING}${NC}" if $BREAKDOWN; then echo -e " breakdown: ${YELLOW}on${NC} (Lambda instruments + P3 tracing spans)" @@ -206,6 +207,13 @@ extract_metrics_line() { }' } +extract_audit_line() { + sed -n '/^AUDIT / { + p + q + }' +} + metric_value() { local line=$1 local key=$2 @@ -334,8 +342,10 @@ run_prover() { local log_rows=$2 local times=() local metrics_file="$TMP_DIR/${prover}_${log_rows}.metrics" + local audit_file="$TMP_DIR/${prover}_${log_rows}.audits" local breakdown_file="$TMP_DIR/${prover}_${log_rows}.breakdown" : > "$metrics_file" + : > "$audit_file" : > "$breakdown_file" for run_i in $(seq 1 "$RUNS"); do local out_file="$TMP_DIR/${prover}_${log_rows}_${run_i}.stdout" @@ -348,6 +358,11 @@ run_prover() { cat "$out_file" exit 1 fi + local audit_line + audit_line=$(extract_audit_line < "$out_file") + if [ -n "$audit_line" ]; then + printf 'run=%s\t%s\n' "$run_i" "$audit_line" >> "$audit_file" + fi local metrics_line metrics_line=$(extract_metrics_line < "$out_file") if [ -z "$metrics_line" ]; then @@ -537,6 +552,20 @@ if [ -n "$REPORT_DIR" ]; then done } > "$REPORT_DIR/raw_metrics.tsv" + # Raw AUDIT lines per run, one row per prover×log_rows×run. Lets the reader + # confirm in retrospect that val_packing_width=1, hash_lanes=1, etc. 
+ { + printf "run\taudit_line\n" + for lr in "${RESULT_LOG_ROWS[@]}"; do + for prover in lambda p3; do + audit_file="$TMP_DIR/${prover}_${lr}.audits" + if [ -f "$audit_file" ]; then + cat "$audit_file" + fi + done + done + } > "$REPORT_DIR/raw_audits.tsv" + if $BREAKDOWN; then { printf "run\tworkload\tprover\tlog_rows\trows\tphase\tms\ttable\ttable_rows\tspan\n" @@ -588,7 +617,8 @@ if [ -n "$REPORT_DIR" ]; then else echo "breakdown=off" fi - echo "p3_extension=degree3_fork" + echo "p3_extension=upstream_cubic_trinomial" + echo "p3_mmcs=scalar_keccak256" if $SCALAR_ACTIVE; then echo "scalar=on" echo "rustflags=$SCALAR_RUSTFLAGS" diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs index 66d9baacd..b71b2ce4e 100644 --- a/bench_vs_plonky3/src/bin/prove_bench.rs +++ b/bench_vs_plonky3/src/bin/prove_bench.rs @@ -520,6 +520,60 @@ fn run_p3(args: &Args) -> BenchMetrics { } } +fn print_audit(args: &Args) { + let prover_name = match args.prover { + ProverKind::Lambda => "lambda", + ProverKind::P3 => "p3", + }; + let rows = 1usize << args.log_rows; + let main_cols = 2 * args.num_sequences; + let trace_cells = rows * main_cols; + let public_values = 2 * args.num_sequences; + let transition_constraints = 2 * args.num_sequences; + + // Common prefix. + let common = format!( + "AUDIT\tprover={prover_name}\tworkload=fib_pair\tlog_rows={}\trows={rows}\t\ + main_cols={main_cols}\taux_cols=0\ttrace_cells={trace_cells}\t\ + public_values={public_values}", + args.log_rows, + ); + + // Per-prover audit fields. 
+ let prover_specific = match args.prover { + ProverKind::Lambda => format!( + "transition_constraints={transition_constraints}\t\ + base_transition_constraints={transition_constraints}\t\ + boundary_constraints={transition_constraints}\t\ + composition_chunks=1" + ), + ProverKind::P3 => { + // P3 counts 2*num_sequences first-row constraints (boundary equivalent, + // encoded inside the AIR via `when_first_row`) + 2*num_sequences + // transition constraints, total 4*num_sequences. + let air_constraints = 4 * args.num_sequences; + let first_row_constraints = 2 * args.num_sequences; + format!( + "air_constraints={air_constraints}\t\ + first_row_constraints={first_row_constraints}\t\ + transition_constraints={transition_constraints}\t\ + boundary_constraints=0\tquotient_chunks=1\t\ + val_packing_width={}\thash_lanes={}", + plonky3_config::VAL_PACKING_WIDTH, + plonky3_config::HASH_LANES, + ) + } + }; + + let tail = format!( + "blowup={}\tqueries={}\tgrinding={}\t\ + trace_generation_timed=false\tverify_in_ratio=false", + args.blowup, args.queries, args.grinding, + ); + + println!("{common}\t{prover_specific}\t{tail}"); +} + fn main() -> ExitCode { let args = match parse_args() { Ok(a) => a, @@ -530,6 +584,8 @@ fn main() -> ExitCode { } }; + print_audit(&args); + let metrics = match args.prover { ProverKind::Lambda => run_lambda(&args), ProverKind::P3 => run_p3(&args), diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs index 9c1ca6024..60596bddc 100644 --- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs +++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs @@ -104,6 +104,23 @@ where } } } + + fn evaluate_prover( + &self, + eval_ctx: &TransitionEvaluationContext, + base_evals: &mut [FieldElement], + _ext_evals: &mut [FieldElement], + ) { + let TransitionEvaluationContext::Prover { frame, .. 
} = eval_ctx else { + unreachable!("evaluate_prover called with non-Prover context"); + }; + let s0 = frame.get_evaluation_step(0); + let s1 = frame.get_evaluation_step(1); + let local_left = s0.get_main_evaluation_element(0, 2 * self.seq_idx); + let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx); + base_evals[self.constraint_idx] = next_left - local_left - local_right; + } } /// `next.right = local.right + next.left` @@ -177,6 +194,23 @@ where } } } + + fn evaluate_prover( + &self, + eval_ctx: &TransitionEvaluationContext, + base_evals: &mut [FieldElement], + _ext_evals: &mut [FieldElement], + ) { + let TransitionEvaluationContext::Prover { frame, .. } = eval_ctx else { + unreachable!("evaluate_prover called with non-Prover context"); + }; + let s0 = frame.get_evaluation_step(0); + let s1 = frame.get_evaluation_step(1); + let local_right = s0.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + let next_left = s1.get_main_evaluation_element(0, 2 * self.seq_idx); + let next_right = s1.get_main_evaluation_element(0, 2 * self.seq_idx + 1); + base_evals[self.constraint_idx] = next_right - local_right - next_left; + } } /// Public inputs: initial `(a, b) = (left, right)` pair for each sequence. 
@@ -226,6 +260,10 @@ where &self.constraints } + fn num_base_transition_constraints(&self) -> usize { + 2 * self.num_sequences + } + fn boundary_constraints( &self, pub_inputs: &Self::PublicInputs, diff --git a/bench_vs_plonky3/src/plonky3_config.rs b/bench_vs_plonky3/src/plonky3_config.rs index 774009bba..d0ead2657 100644 --- a/bench_vs_plonky3/src/plonky3_config.rs +++ b/bench_vs_plonky3/src/plonky3_config.rs @@ -4,26 +4,24 @@ use p3_dft::Radix2DitParallel; use p3_field::extension::CubicTrinomialExtensionField; use p3_fri::{FriParameters, TwoAdicFriPcs}; use p3_goldilocks::Goldilocks; -use p3_keccak::{Keccak256Hash, KeccakF}; +use p3_keccak::Keccak256Hash; use p3_merkle_tree::MerkleTreeMmcs; -use p3_symmetric::{CompressionFunctionFromHasher, PaddingFreeSponge, SerializingHasher}; +use p3_symmetric::{CompressionFunctionFromHasher, SerializingHasher}; use p3_uni_stark::StarkConfig; pub type Val = Goldilocks; pub type Challenge = CubicTrinomialExtensionField; +// Scalar byte-oriented MMCS, deliberately not the Plonky3 production config. +// Leaves are individual field elements, digests are 32 raw bytes, and the +// underlying Keccak path is single-input tiny_keccak. This removes the +// `[Val; VECTOR_LEN]` / `[u64; VECTOR_LEN]` Keccak lanes that the +// vector-friendly upstream config uses (NEON=2, SSE2=2, AVX2=4, AVX-512=8), +// so the Merkle compression cost is one Keccak-f per call on both sides. 
type ByteHash = Keccak256Hash; -type U64Hash = PaddingFreeSponge; -type FieldHash = SerializingHasher; -type MyCompress = CompressionFunctionFromHasher; -pub type ValMmcs = MerkleTreeMmcs< - [Val; p3_keccak::VECTOR_LEN], - [u64; p3_keccak::VECTOR_LEN], - FieldHash, - MyCompress, - 2, - 4, ->; +type FieldHash = SerializingHasher; +type MyCompress = CompressionFunctionFromHasher; +pub type ValMmcs = MerkleTreeMmcs; type ChallengeMmcs = ExtensionMmcs; type Dft = Radix2DitParallel; pub type Pcs = TwoAdicFriPcs; @@ -31,11 +29,19 @@ pub type Challenger = SerializingChallenger64; +/// Packing width of the MMCS leaves (`P` parameter of `MerkleTreeMmcs`). +/// `Val` directly = 1; `[Val; N]` would be `N`. Exposed for the AUDIT line. +pub const VAL_PACKING_WIDTH: usize = 1; + +/// Lanes of the underlying Keccak permutation as seen by the MMCS. +/// `Keccak256Hash` is single-input scalar; lane-vectorized `KeccakF` paths +/// would set this to 2/4/8 depending on arch. +pub const HASH_LANES: usize = 1; + fn build_mmcs() -> (ValMmcs, ChallengeMmcs, ByteHash) { let byte_hash = ByteHash {}; - let u64_hash = U64Hash::new(KeccakF {}); - let field_hash = FieldHash::new(u64_hash); - let compress = MyCompress::new(u64_hash); + let field_hash = FieldHash::new(byte_hash); + let compress = MyCompress::new(byte_hash); let val_mmcs = ValMmcs::new(field_hash, compress, 3); let challenge_mmcs = ChallengeMmcs::new(val_mmcs.clone()); (val_mmcs, challenge_mmcs, byte_hash) From e9d5abb889958c9cc40e02d8d5f0729cf36f9b5f Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 19 May 2026 12:15:53 -0300 Subject: [PATCH 31/34] Refresh Plonky3 to latest main --- bench_vs_plonky3/Cargo.toml | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/bench_vs_plonky3/Cargo.toml b/bench_vs_plonky3/Cargo.toml index 26bb49cc1..8fef10667 100644 --- a/bench_vs_plonky3/Cargo.toml +++ b/bench_vs_plonky3/Cargo.toml @@ -12,21 +12,18 @@ math = { path = "../crypto/math", features = 
[ "lambdaworks-serde-binary", ] } -# Pinned to the commit currently resolved in Cargo.lock so the benchmark is -# reproducible against an exact P3 revision. Bumping is fine; it must be an -# explicit decision, not the result of an unrelated `cargo update`. -p3-air = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-field = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-commit = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-fri = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae" } -p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] } -p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", rev = "de83ef4367b66d5908623c6503946ffcfdc3b6ae", features = ["parallel"] } +p3-air = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-field = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-matrix = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-commit = { git = 
"https://github.com/Plonky3/Plonky3.git" } +p3-challenger = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-merkle-tree = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-keccak = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-fri = { git = "https://github.com/Plonky3/Plonky3.git" } +p3-uni-stark = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] } +p3-dft = { git = "https://github.com/Plonky3/Plonky3.git", features = ["parallel"] } # Tracing for P3 span-based profiling tracing = "0.1" From 18f1f0c7cc173b7002ec9fc93211c2d1a6921306 Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Tue, 19 May 2026 15:49:59 -0300 Subject: [PATCH 32/34] cleanup --- .github/scripts/publish_bench_vs.sh | 42 +++++++++++++------------- .github/workflows/bench-vs-nightly.yml | 5 ++- bench_vs_plonky3/run_p3_nightly.sh | 41 ------------------------- 3 files changed, 25 insertions(+), 63 deletions(-) delete mode 100755 bench_vs_plonky3/run_p3_nightly.sh diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh index 01e80ee0c..f96181e8c 100644 --- a/.github/scripts/publish_bench_vs.sh +++ b/.github/scripts/publish_bench_vs.sh @@ -118,22 +118,22 @@ p3_fmt_ratio_pair() { } P3_SECTION="" -P3_HEADLINE_FILE="bench_vs_artifacts/p3/headline/metrics.txt" -if [ -f "$P3_HEADLINE_FILE" ]; then - H_LOG_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "log_rows_series") - H_COLS=$(p3_parse "$P3_HEADLINE_FILE" "columns") - H_BLOWUP=$(p3_parse "$P3_HEADLINE_FILE" "blowup") - H_QUERIES=$(p3_parse "$P3_HEADLINE_FILE" "fri_queries") - H_ROWS=$(p3_parse "$P3_HEADLINE_FILE" "rows_series") - H_LAMBDA_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "lambda_prove_medians") - H_P3_PROVE=$(p3_parse "$P3_HEADLINE_FILE" "p3_prove_medians") - H_LAMBDA_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "lambda_verify_medians") - H_P3_VERIFY=$(p3_parse "$P3_HEADLINE_FILE" "p3_verify_medians") - 
H_LAMBDA_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "lambda_proof_size_medians") - H_P3_PROOF=$(p3_parse "$P3_HEADLINE_FILE" "p3_proof_size_medians") - H_LAMBDA_RSS=$(p3_parse "$P3_HEADLINE_FILE" "lambda_peak_rss_medians") - H_P3_RSS=$(p3_parse "$P3_HEADLINE_FILE" "p3_peak_rss_medians") - H_RATIO=$(p3_parse "$P3_HEADLINE_FILE" "ratios_lambda_over_p3") +P3_FILE="bench_vs_artifacts/p3/metrics.txt" +if [ -f "$P3_FILE" ]; then + H_LOG_ROWS=$(p3_parse "$P3_FILE" "log_rows_series") + H_COLS=$(p3_parse "$P3_FILE" "columns") + H_BLOWUP=$(p3_parse "$P3_FILE" "blowup") + H_QUERIES=$(p3_parse "$P3_FILE" "fri_queries") + H_ROWS=$(p3_parse "$P3_FILE" "rows_series") + H_LAMBDA_PROVE=$(p3_parse "$P3_FILE" "lambda_prove_medians") + H_P3_PROVE=$(p3_parse "$P3_FILE" "p3_prove_medians") + H_LAMBDA_VERIFY=$(p3_parse "$P3_FILE" "lambda_verify_medians") + H_P3_VERIFY=$(p3_parse "$P3_FILE" "p3_verify_medians") + H_LAMBDA_PROOF=$(p3_parse "$P3_FILE" "lambda_proof_size_medians") + H_P3_PROOF=$(p3_parse "$P3_FILE" "p3_proof_size_medians") + H_LAMBDA_RSS=$(p3_parse "$P3_FILE" "lambda_peak_rss_medians") + H_P3_RSS=$(p3_parse "$P3_FILE" "p3_peak_rss_medians") + H_RATIO=$(p3_parse "$P3_FILE" "ratios_lambda_over_p3") H_ROWS_FMT=$(LC_NUMERIC=C awk -v r="$H_ROWS" 'BEGIN { if (r == "") { print "n/a"; exit } @@ -149,12 +149,12 @@ if [ -f "$P3_HEADLINE_FILE" ]; then printf "%.2fx", r }') - P3_HEADLINE_MRKDWN="*log_rows=${H_LOG_ROWS} (${H_ROWS_FMT} rows · ${H_COLS} cols · blowup=${H_BLOWUP} · ${H_QUERIES} queries)*" - P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Lambda:* $(p3_fmt_seconds "$H_LAMBDA_PROVE") prove · $(p3_fmt_seconds "$H_LAMBDA_VERIFY") verify · $(p3_fmt_mb "$H_LAMBDA_PROOF") proof · $(p3_fmt_gb "$H_LAMBDA_RSS") RSS" - P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS" - P3_HEADLINE_MRKDWN="${P3_HEADLINE_MRKDWN}\\n*Ratio L/P3:* 
${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS" + P3_MRKDWN="*log_rows=${H_LOG_ROWS} (${H_ROWS_FMT} rows · ${H_COLS} cols · blowup=${H_BLOWUP} · ${H_QUERIES} queries)*" + P3_MRKDWN="${P3_MRKDWN}\\n*Lambda:* $(p3_fmt_seconds "$H_LAMBDA_PROVE") prove · $(p3_fmt_seconds "$H_LAMBDA_VERIFY") verify · $(p3_fmt_mb "$H_LAMBDA_PROOF") proof · $(p3_fmt_gb "$H_LAMBDA_RSS") RSS" + P3_MRKDWN="${P3_MRKDWN}\\n*Plonky3:* $(p3_fmt_seconds "$H_P3_PROVE") prove · $(p3_fmt_seconds "$H_P3_VERIFY") verify · $(p3_fmt_mb "$H_P3_PROOF") proof · $(p3_fmt_gb "$H_P3_RSS") RSS" + P3_MRKDWN="${P3_MRKDWN}\\n*Ratio L/P3:* ${PROVE_RATIO_FMT} prove · ${PROOF_RATIO} proof · ${RSS_RATIO} RSS" - P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3 - Headline"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_HEADLINE_MRKDWN"'"}}' + P3_SECTION=',{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs Plonky3"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$P3_MRKDWN"'"}}' fi ETHREX_METRICS_FILE="bench_vs_artifacts/ethrex_metrics.txt" diff --git a/.github/workflows/bench-vs-nightly.yml b/.github/workflows/bench-vs-nightly.yml index 152ce95a1..4d21a0a31 100644 --- a/.github/workflows/bench-vs-nightly.yml +++ b/.github/workflows/bench-vs-nightly.yml @@ -71,7 +71,10 @@ jobs: -p p3-uni-stark -p p3-dft - name: Run Plonky3 nightly benchmark - run: bash ./bench_vs_plonky3/run_p3_nightly.sh bench_vs_artifacts/p3 + run: | + bash ./bench_vs_plonky3/run.sh \ + --log-rows 21 --num-sequences 16 --runs 10 --scalar \ + --report-dir bench_vs_artifacts/p3 --no-color - name: Upload nightly benchmark artifact uses: actions/upload-artifact@v4 diff --git a/bench_vs_plonky3/run_p3_nightly.sh b/bench_vs_plonky3/run_p3_nightly.sh deleted file mode 100755 index 47957b6ff..000000000 --- a/bench_vs_plonky3/run_p3_nightly.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# Orchestrates the Lambda-vs-Plonky3 
nightly benchmark. -# -# Runs the headline configuration (log_rows=21, num_sequences=16 → 32 cols) -# into `$REPORT_BASE/headline/`. Consumed by -# `.github/scripts/publish_bench_vs.sh` to render the Headline section of -# the Slack post. -# -# Usage: -# ./bench_vs_plonky3/run_p3_nightly.sh [REPORT_BASE] -# -# Defaults: REPORT_BASE=bench_vs_artifacts/p3 - -set -euo pipefail - -REPORT_BASE="${1:-bench_vs_artifacts/p3}" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RUN_SH="$SCRIPT_DIR/run.sh" - -if [ ! -x "$RUN_SH" ]; then - echo "run.sh not found or not executable at $RUN_SH" >&2 - exit 1 -fi - -run_one() { - local label=$1 - local log_rows=$2 - local num_sequences=$3 - local out_dir="$REPORT_BASE/$label" - echo - echo "=== ${label} (log_rows=${log_rows}, num_sequences=${num_sequences}) ===" - bash "$RUN_SH" \ - --log-rows "$log_rows" \ - --num-sequences "$num_sequences" \ - --runs 10 \ - --scalar \ - --report-dir "$out_dir" \ - --no-color -} - -run_one headline 21 16 From bb7d8baad3e7d71088c9071850c34d0f0fca5d2e Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Wed, 20 May 2026 11:04:55 -0300 Subject: [PATCH 33/34] address comments --- .github/scripts/publish_bench_vs.sh | 4 +- bench_vs_plonky3/README.md | 42 +++++---- bench_vs_plonky3/run.sh | 32 +------ bench_vs_plonky3/src/bin/prove_bench.rs | 78 ++--------------- bench_vs_plonky3/src/lambda_fibonacci_pair.rs | 7 ++ bench_vs_plonky3/src/lib.rs | 87 +------------------ 6 files changed, 48 insertions(+), 202 deletions(-) diff --git a/.github/scripts/publish_bench_vs.sh b/.github/scripts/publish_bench_vs.sh index f96181e8c..d1585cd9f 100644 --- a/.github/scripts/publish_bench_vs.sh +++ b/.github/scripts/publish_bench_vs.sh @@ -13,7 +13,7 @@ METRICS_FILE="bench_vs_artifacts/metrics.txt" if [ ! 
-f "$METRICS_FILE" ]; then curl -X POST "$WEBHOOK_URL" \ -H 'Content-Type: application/json; charset=utf-8' \ - --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"section","text":{"type":"mrkdwn","text":":x: Benchmark failed - no metrics found. Check the workflow logs."}}]}' + --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM Nightly Benchmark"}},{"type":"section","text":{"type":"mrkdwn","text":":x: Benchmark failed - no metrics found. Check the workflow logs."}}]}' exit 0 fi @@ -173,4 +173,4 @@ fi curl -X POST "$WEBHOOK_URL" \ -H 'Content-Type: application/json; charset=utf-8' \ - --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6 - Nightly Benchmark"}},{"type":"context","elements":[{"type":"mrkdwn","text":"*Program:* Fibonacci · *Device:* CPU"}]},{"type":"divider"},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION$ETHREX_SECTION$P3_SECTION"']}' + --data '{"blocks":[{"type":"header","text":{"type":"plain_text","text":"Lambda VM Nightly Benchmark"}},{"type":"context","elements":[{"type":"mrkdwn","text":"*Program:* Fibonacci · *Device:* CPU"}]},{"type":"divider"},{"type":"header","text":{"type":"plain_text","text":"Lambda VM vs SP1 v6"}},{"type":"section","text":{"type":"mrkdwn","text":"'"$RESULTS_MRKDWN"'"}}'"$PROJ_SECTION$ETHREX_SECTION$P3_SECTION"']}' diff --git a/bench_vs_plonky3/README.md b/bench_vs_plonky3/README.md index a78249f31..60d44792a 100644 --- a/bench_vs_plonky3/README.md +++ b/bench_vs_plonky3/README.md @@ -81,22 +81,25 @@ Stdout (without `--report-dir`): [build] prove_bench --- log-rows=19 (rows = 524288) --- - [lambda] median 2.444s from 3 runs: 2.444,2.279,2.830 - [p3] median 0.988s from 3 runs: 0.981,0.993,0.988 + [lambda] prove median 0.574s (CV 3.07%), verify 0.024s, proof 4116000 B, rss 805000 KB + [p3] prove median 0.324s (CV 2.85%), verify 0.019s, proof 
1987000 B, rss 627000 KB === Summary === - log-rows rows Lambda (s) P3 (s) L/P3 - -------- ---- ---------- ------ ---- - 19 524288 2.444s 0.988s 2.474x (P3 faster) + log-rows rows Lambda (s) L CV% P3 (s) P3 CV% L/P3 + -------- ---- ---------- ----- ------ ------ ---- + 19 524288 0.574s 3.07% 0.324s 2.85% 1.770x (P3 faster) -Timing window: single-shot end-to-end prove. -Ratio = Lambda / P3. ratio > 1 → P3 faster (Lambda took ratio× longer); ratio < 1 → Lambda faster. +Timing window: prove only for the ratio. Verify, proof size, RSS and throughput are reported separately. ``` With `--report-dir DIR` the script writes: -- `results.tsv` — tab-separated raw data (`log_rows, rows, lambda_median_s, - p3_median_s, ratio_lambda_over_p3, runs`). +- `results.tsv` — tab-separated, one row per `log_rows` size with 14 columns: + `log_rows, rows, lambda_prove_median_s, lambda_prove_cv_pct, + lambda_verify_median_s, lambda_proof_size_bytes_median, + lambda_peak_rss_kb_median, p3_prove_median_s, p3_prove_cv_pct, + p3_verify_median_s, p3_proof_size_bytes_median, p3_peak_rss_kb_median, + ratio_lambda_over_p3, runs`. - `raw_metrics.tsv` — one row per `(prover, log_rows, run)` with all `METRICS` fields parsed out. - `raw_audits.tsv` — one row per `(prover, log_rows, run)` with the AUDIT @@ -114,22 +117,27 @@ downstream tooling. ## Nightly -A GitHub Actions workflow (`.github/workflows/bench-vs-p3-nightly.yml`) runs -daily at 07:30 UTC (04:30 Buenos Aires, after the SP1 nightly completes) on -the self-hosted `bench` runner. It executes: +The Lambda-vs-Plonky3 bench is part of the shared +`.github/workflows/bench-vs-nightly.yml` workflow, which runs daily at +06:00 UTC (03:00 Buenos Aires) on the self-hosted `bench` runner. 
The P3 +step executes after the Lambda-vs-SP1 and ethrex empty-block steps: ```bash bash ./bench_vs_plonky3/run.sh \ - --log-rows 19 \ + --log-rows 21 \ --num-sequences 16 \ - --runs 3 \ + --runs 10 \ --scalar \ - --report-dir bench_vs_p3_artifacts \ + --report-dir bench_vs_artifacts/p3 \ --no-color ``` -The `bench_vs_p3_artifacts/` directory is uploaded as an artifact named -`bench-vs-p3-nightly--` with 90-day retention. +A `cargo update -p p3-*` runs before this step so the bench tracks the +latest upstream Plonky3 `main`. The full `bench_vs_artifacts/` directory +(SP1 + ethrex + P3 outputs) is uploaded as one artifact named +`bench-vs-nightly--` with 90-day retention. A "Lambda +VM vs Plonky3" section is appended to the same Slack post that publishes +the SP1 and ethrex results. ## Breakdown (per-phase timing) for manual analysis diff --git a/bench_vs_plonky3/run.sh b/bench_vs_plonky3/run.sh index 20f1f6331..a4ce67fc2 100755 --- a/bench_vs_plonky3/run.sh +++ b/bench_vs_plonky3/run.sh @@ -242,10 +242,6 @@ ratio_fmt() { }' } -mean_file() { - LC_NUMERIC=C awk '{ s += $1; n++ } END { if (n == 0) print "n/a"; else printf "%.6f\n", s / n }' "$1" -} - median_file() { LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk ' { a[NR] = $0 + 0 } @@ -256,18 +252,6 @@ median_file() { }' } -stddev_file() { - LC_NUMERIC=C awk ' - { s += $1; ss += $1 * $1; n++ } - END { - if (n == 0) { print "n/a"; exit } - m = s / n - v = (ss / n) - (m * m) - if (v < 0) v = 0 - printf "%.6f\n", sqrt(v) - }' "$1" -} - cv_pct_file() { LC_NUMERIC=C awk ' { s += $1; ss += $1 * $1; n++ } @@ -282,14 +266,6 @@ cv_pct_file() { }' "$1" } -min_file() { - LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk 'NR == 1 { printf "%.6f\n", $1; exit }' -} - -max_file() { - LC_ALL=C sort -g "$1" | LC_NUMERIC=C awk '{ x = $1 } END { if (NR == 0) print "n/a"; else printf "%.6f\n", x }' -} - fmt0() { LC_NUMERIC=C awk -v v="$1" 'BEGIN { if (v == "n/a") print v; else printf "%.0f\n", v }' } @@ -354,8 +330,8 @@ run_prover() { 
run_args+=(--breakdown) fi if ! "$BIN" "${run_args[@]}" > "$out_file" 2>&1; then - echo -e " ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}" - cat "$out_file" + echo -e " ${RED}[${prover}] FAILED on log-rows=${log_rows} run ${run_i}${NC}" >&2 + cat "$out_file" >&2 exit 1 fi local audit_line @@ -366,8 +342,8 @@ run_prover() { local metrics_line metrics_line=$(extract_metrics_line < "$out_file") if [ -z "$metrics_line" ]; then - echo -e " ${RED}[${prover}] could not parse metrics (log-rows=${log_rows}, run ${run_i})${NC}" - cat "$out_file" + echo -e " ${RED}[${prover}] could not parse metrics (log-rows=${log_rows}, run ${run_i})${NC}" >&2 + cat "$out_file" >&2 exit 1 fi printf '%s\n' "$metrics_line" >> "$metrics_file" diff --git a/bench_vs_plonky3/src/bin/prove_bench.rs b/bench_vs_plonky3/src/bin/prove_bench.rs index b71b2ce4e..c132f57a5 100644 --- a/bench_vs_plonky3/src/bin/prove_bench.rs +++ b/bench_vs_plonky3/src/bin/prove_bench.rs @@ -12,9 +12,9 @@ //! log-rows=19, num-sequences=16, blowup=2, queries=219, grinding=0. 
use std::process::ExitCode; -use std::sync::{Arc, Mutex}; use std::time::Instant; +use bench_vs_plonky3::span_timing::{P3TimingLayer, SpanResults as P3SpanResults}; use bench_vs_plonky3::{lambda_fibonacci_pair, plonky3_config, plonky3_fibonacci}; use crypto::fiat_shamir::default_transcript::DefaultTranscript; use math::field::element::FieldElement; @@ -327,74 +327,8 @@ fn emit_lambda_breakdown(args: &Args, rows: usize, total_ms: f64) { eprintln!("warning: Lambda phase breakdown requires building with --features instruments"); } -struct SpanState { - name: String, - active_since: Option, - accumulated: std::time::Duration, -} - -struct P3TimingLayer { - spans: Mutex>, - results: Arc>>, -} - -impl tracing_subscriber::registry::LookupSpan<'lookup>> - tracing_subscriber::Layer for P3TimingLayer -{ - fn on_new_span( - &self, - attrs: &tracing::span::Attributes<'_>, - id: &tracing::span::Id, - _ctx: tracing_subscriber::layer::Context<'_, S>, - ) { - self.spans.lock().unwrap().insert( - id.into_u64(), - SpanState { - name: attrs.metadata().name().to_string(), - active_since: None, - accumulated: std::time::Duration::ZERO, - }, - ); - } - - fn on_enter(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { - if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) - && entry.active_since.is_none() - { - entry.active_since = Some(Instant::now()); - } - } - - fn on_exit(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { - if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) - && let Some(start) = entry.active_since.take() - { - entry.accumulated += start.elapsed(); - } - } - - fn on_close(&self, id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { - if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) { - let mut total = entry.accumulated; - if let Some(start) = entry.active_since { - total += start.elapsed(); - } - self.results - .lock() - 
.unwrap() - .push((entry.name, ms(total.as_secs_f64()))); - } - } -} - -type P3SpanResults = Arc>>; - fn p3_span_subscriber() -> (impl tracing::Subscriber + Send + Sync, P3SpanResults) { - let results = Arc::new(Mutex::new(Vec::new())); - let layer = P3TimingLayer { - spans: Mutex::new(std::collections::HashMap::new()), - results: Arc::clone(&results), - }; + let (layer, results) = P3TimingLayer::new(); let filter = tracing_subscriber::filter::LevelFilter::DEBUG; ( tracing_subscriber::registry().with(filter).with(layer), @@ -411,13 +345,17 @@ fn peak_rss_kb() -> Option { } let maxrss = unsafe { usage.assume_init().ru_maxrss }; + if maxrss < 0 { + return None; + } + let maxrss = maxrss as u64; #[cfg(target_os = "macos")] { - Some((maxrss as u64).div_ceil(1024)) + Some(maxrss.div_ceil(1024)) } #[cfg(not(target_os = "macos"))] { - Some(maxrss as u64) + Some(maxrss) } } diff --git a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs index 60596bddc..bae1235dc 100644 --- a/bench_vs_plonky3/src/lambda_fibonacci_pair.rs +++ b/bench_vs_plonky3/src/lambda_fibonacci_pair.rs @@ -271,6 +271,13 @@ where _bus_public_inputs: Option<&stark::lookup::BusPublicInputs>, _trace_length: usize, ) -> BoundaryConstraints { + assert_eq!( + pub_inputs.initial_values.len(), + self.num_sequences, + "AIR built for {} sequences, public inputs carry {}", + self.num_sequences, + pub_inputs.initial_values.len(), + ); let mut constraints = Vec::with_capacity(2 * pub_inputs.initial_values.len()); for (seq_idx, (a, b)) in pub_inputs.initial_values.iter().enumerate() { constraints.push(BoundaryConstraint::new_main( diff --git a/bench_vs_plonky3/src/lib.rs b/bench_vs_plonky3/src/lib.rs index 7c722153e..dd5cbf675 100644 --- a/bench_vs_plonky3/src/lib.rs +++ b/bench_vs_plonky3/src/lib.rs @@ -1,6 +1,7 @@ pub mod lambda_fibonacci_pair; pub mod plonky3_config; pub mod plonky3_fibonacci; +pub mod span_timing; #[cfg(test)] mod tests { @@ -206,93 +207,9 @@ mod tests { 
println!("\n============================================================"); println!("Plonky3 STARK Span Breakdown"); - use std::collections::HashMap; - use std::sync::{Arc, Mutex}; use tracing_subscriber::layer::SubscriberExt; - type SpanResults = Arc>>; - - struct SpanState { - name: String, - active_since: Option, - accumulated: std::time::Duration, - } - - struct P3TimingLayer { - spans: Mutex>, - results: SpanResults, - } - - impl< - S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>, - > tracing_subscriber::Layer for P3TimingLayer - { - fn on_new_span( - &self, - attrs: &tracing::span::Attributes<'_>, - id: &tracing::span::Id, - _ctx: tracing_subscriber::layer::Context<'_, S>, - ) { - let name = attrs.metadata().name().to_string(); - self.spans.lock().unwrap().insert( - id.into_u64(), - SpanState { - name, - active_since: None, - accumulated: std::time::Duration::ZERO, - }, - ); - } - - // Rayon can re-enter a span across threads, so only start timing on - // the first enter after each exit; accumulate every interval. - fn on_enter( - &self, - id: &tracing::span::Id, - _ctx: tracing_subscriber::layer::Context<'_, S>, - ) { - if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) - && entry.active_since.is_none() - { - entry.active_since = Some(std::time::Instant::now()); - } - } - - fn on_exit( - &self, - id: &tracing::span::Id, - _ctx: tracing_subscriber::layer::Context<'_, S>, - ) { - if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) - && let Some(start) = entry.active_since.take() - { - entry.accumulated += start.elapsed(); - } - } - - fn on_close( - &self, - id: tracing::span::Id, - _ctx: tracing_subscriber::layer::Context<'_, S>, - ) { - if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) { - // If we never saw on_exit (span closed while active), include - // the dangling interval. 
- let mut total = entry.accumulated; - if let Some(start) = entry.active_since { - total += start.elapsed(); - } - let ms = total.as_secs_f64() * 1000.0; - self.results.lock().unwrap().push((entry.name, ms)); - } - } - } - - let results: SpanResults = Arc::new(Mutex::new(Vec::new())); - let layer = P3TimingLayer { - spans: Mutex::new(HashMap::new()), - results: Arc::clone(&results), - }; + let (layer, results) = crate::span_timing::P3TimingLayer::new(); let filter = tracing_subscriber::filter::LevelFilter::DEBUG; let subscriber = tracing_subscriber::registry().with(filter).with(layer); From cd41dd7702759a95f1b573c7ab855d71940132df Mon Sep 17 00:00:00 2001 From: jotabulacios Date: Wed, 20 May 2026 12:06:38 -0300 Subject: [PATCH 34/34] add missing file --- bench_vs_plonky3/src/span_timing.rs | 83 +++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 bench_vs_plonky3/src/span_timing.rs diff --git a/bench_vs_plonky3/src/span_timing.rs b/bench_vs_plonky3/src/span_timing.rs new file mode 100644 index 000000000..4d37423fb --- /dev/null +++ b/bench_vs_plonky3/src/span_timing.rs @@ -0,0 +1,83 @@ +//! Tracing layer that accumulates per-span wall-clock durations from +//! Plonky3's `tracing` instrumentation. Used by `prove_bench --breakdown` +//! and by the `instruments_breakdown` test. 
+ +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use tracing_subscriber::Layer; + +pub type SpanResults = Arc<Mutex<Vec<(String, f64)>>>; + +struct SpanState { + name: String, + active_since: Option<Instant>, + accumulated: Duration, +} + +pub struct P3TimingLayer { + spans: Mutex<HashMap<u64, SpanState>>, + results: SpanResults, +} + +impl P3TimingLayer { + pub fn new() -> (Self, SpanResults) { + let results: SpanResults = Arc::new(Mutex::new(Vec::new())); + let layer = Self { + spans: Mutex::new(HashMap::new()), + results: Arc::clone(&results), + }; + (layer, results) + } +} + +impl<S> Layer<S> for P3TimingLayer +where + S: tracing::Subscriber + for<'lookup> tracing_subscriber::registry::LookupSpan<'lookup>, +{ + fn on_new_span( + &self, + attrs: &tracing::span::Attributes<'_>, + id: &tracing::span::Id, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + self.spans.lock().unwrap().insert( + id.into_u64(), + SpanState { + name: attrs.metadata().name().to_string(), + active_since: None, + accumulated: Duration::ZERO, + }, + ); + } + + fn on_enter(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) + && entry.active_since.is_none() + { + entry.active_since = Some(Instant::now()); + } + } + + fn on_exit(&self, id: &tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { + if let Some(entry) = self.spans.lock().unwrap().get_mut(&id.into_u64()) + && let Some(start) = entry.active_since.take() + { + entry.accumulated += start.elapsed(); + } + } + + fn on_close(&self, id: tracing::span::Id, _ctx: tracing_subscriber::layer::Context<'_, S>) { + if let Some(entry) = self.spans.lock().unwrap().remove(&id.into_u64()) { + let mut total = entry.accumulated; + if let Some(start) = entry.active_since { + total += start.elapsed(); + } + self.results + .lock() + .unwrap() + .push((entry.name, total.as_secs_f64() * 1000.0)); + } + } +}