From d1a0abf4fff108ac80e07c3ca0b5bb00cf0eb970 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Wed, 6 May 2026 15:12:54 -0300 Subject: [PATCH 1/7] add first cuda files --- Cargo.lock | 32 + Cargo.toml | 1 + crypto/math-cuda/Cargo.toml | 22 + crypto/math-cuda/build.rs | 57 + crypto/math-cuda/kernels/arith.cu | 83 ++ crypto/math-cuda/kernels/ext3.cuh | 121 +++ crypto/math-cuda/kernels/goldilocks.cuh | 69 ++ crypto/math-cuda/kernels/ntt.cu | 285 +++++ crypto/math-cuda/src/device.rs | 251 +++++ crypto/math-cuda/src/lde.rs | 984 ++++++++++++++++++ crypto/math-cuda/src/lib.rs | 152 +++ crypto/math-cuda/src/ntt.rs | 211 ++++ crypto/math-cuda/tests/bench_quick.rs | 356 +++++++ crypto/math-cuda/tests/evaluate_coset_ext3.rs | 143 +++ crypto/math-cuda/tests/ext3.rs | 87 ++ crypto/math-cuda/tests/goldilocks.rs | 127 +++ crypto/math-cuda/tests/lde.rs | 112 ++ crypto/math-cuda/tests/lde_batch.rs | 96 ++ crypto/math-cuda/tests/lde_batch_ext3.rs | 161 +++ crypto/math-cuda/tests/ntt.rs | 136 +++ crypto/stark/Cargo.toml | 4 + 21 files changed, 3490 insertions(+) create mode 100644 crypto/math-cuda/Cargo.toml create mode 100644 crypto/math-cuda/build.rs create mode 100644 crypto/math-cuda/kernels/arith.cu create mode 100644 crypto/math-cuda/kernels/ext3.cuh create mode 100644 crypto/math-cuda/kernels/goldilocks.cuh create mode 100644 crypto/math-cuda/kernels/ntt.cu create mode 100644 crypto/math-cuda/src/device.rs create mode 100644 crypto/math-cuda/src/lde.rs create mode 100644 crypto/math-cuda/src/lib.rs create mode 100644 crypto/math-cuda/src/ntt.rs create mode 100644 crypto/math-cuda/tests/bench_quick.rs create mode 100644 crypto/math-cuda/tests/evaluate_coset_ext3.rs create mode 100644 crypto/math-cuda/tests/ext3.rs create mode 100644 crypto/math-cuda/tests/goldilocks.rs create mode 100644 crypto/math-cuda/tests/lde.rs create mode 100644 crypto/math-cuda/tests/lde_batch.rs create mode 100644 crypto/math-cuda/tests/lde_batch_ext3.rs create mode 100644 crypto/math-cuda/tests/ntt.rs diff --git a/Cargo.lock b/Cargo.lock index f6eea84d6..7b6ed3c62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -803,6 +803,15 @@ dependencies = [ "typenum", ] +[[package]] +name = "cudarc" +version = "0.19.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f071cd6a7b5d51607df76aa2d426aaabc7a74bc6bdb885b8afa63a880572ad9b" +dependencies = [ + "libloading", +] + [[package]] name = "darling" version = "0.21.3" @@ -1989,6 +1998,16 @@ version = "0.2.178" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libm" version = "0.2.15" @@ -2105,6 +2124,18 @@ dependencies = [ "serde_json", ] +[[package]] +name = "math-cuda" +version = "0.1.0" +dependencies = [ + "cudarc", + "math", + "rand 0.8.5", + "rand_chacha 0.3.1", + "rayon", + "sha3", +] + [[package]] name = "memchr" version = "2.7.6" @@ -3172,6 +3203,7 @@ dependencies = [ "itertools 0.11.0", "log", "math", + "math-cuda", "rayon", "serde", "serde-wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 4d10b7c44..e43dc7f0d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crypto/stark", "crypto/crypto", "crypto/math", + "crypto/math-cuda", "bin/cli", ] diff --git a/crypto/math-cuda/Cargo.toml b/crypto/math-cuda/Cargo.toml new file mode 100644 index 000000000..8c22d1110 --- /dev/null +++ b/crypto/math-cuda/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "math-cuda" +description = "CUDA-accelerated FFT/NTT for Goldilocks (base field) used by the lambda-vm STARK prover" +version = "0.1.0" +edition = "2024" + +[dependencies] +cudarc = { version = "0.19", default-features = false, features = [ + "driver", + "nvrtc", + "std", + "cuda-12080", + "dynamic-loading", +] } +math = { path = "../math" } +rayon = "1.7" + +[dev-dependencies] +rand = { version = "0.8.5", features = ["std"] } +rand_chacha = "0.3.1" +rayon = "1.7" +sha3 = "0.10.8" diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs new file mode 100644 index 000000000..a6defb5ab --- /dev/null +++ b/crypto/math-cuda/build.rs @@ -0,0 +1,57 @@ +use std::env; +use std::path::PathBuf; +use std::process::Command; + +fn cuda_home() -> PathBuf { + env::var_os("CUDA_HOME") + .or_else(|| env::var_os("CUDA_PATH")) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/usr/local/cuda")) +} + +fn nvcc_path() -> PathBuf { + cuda_home().join("bin").join("nvcc") +} + +fn compile_ptx(src: &str, out_name: &str) { + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let src_path = manifest_dir.join("kernels").join(src); + let out_path = out_dir.join(out_name); + + println!("cargo:rerun-if-changed=kernels/{src}"); + println!("cargo:rerun-if-env-changed=CUDA_HOME"); + println!("cargo:rerun-if-env-changed=CUDA_PATH"); + println!("cargo:rerun-if-env-changed=CUDARC_NVCC_ARCH"); + + // Emit PTX for a virtual architecture; the CUDA driver JIT-compiles it for the + // actual GPU at load time, so one PTX works across Ada/Hopper/Blackwell. Override + // with CUDARC_NVCC_ARCH to pin a specific compute capability. + let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| "compute_89".to_string()); + + let status = Command::new(nvcc_path()) + .args([ + "--ptx", + "-O3", + "-std=c++17", + "-arch", + &arch, + "-o", + ]) + .arg(&out_path) + .arg(&src_path) + .status() + .expect("failed to invoke nvcc — is CUDA installed and CUDA_HOME set?"); + + if !status.success() { + panic!("nvcc failed compiling {}", src_path.display()); + } +} + +fn main() { + // Headers are not compiled; emit rerun-if-changed so edits trigger rebuilds. + println!("cargo:rerun-if-changed=kernels/goldilocks.cuh"); + println!("cargo:rerun-if-changed=kernels/ext3.cuh"); + compile_ptx("arith.cu", "arith.ptx"); + compile_ptx("ntt.cu", "ntt.ptx"); +} diff --git a/crypto/math-cuda/kernels/arith.cu b/crypto/math-cuda/kernels/arith.cu new file mode 100644 index 000000000..4bee9b8bb --- /dev/null +++ b/crypto/math-cuda/kernels/arith.cu @@ -0,0 +1,83 @@ +// Element-wise Goldilocks kernels used by the Phase-2 parity tests. These mirror +// the CPU reference in `crypto/math/src/field/goldilocks.rs` so raw u64 outputs +// are bit-identical to the CPU path. + +#include "goldilocks.cuh" +#include "ext3.cuh" + +using goldilocks::add; +using goldilocks::sub; +using goldilocks::mul; +using goldilocks::neg; + +extern "C" __global__ void vector_add_u64(const uint64_t *a, + const uint64_t *b, + uint64_t *c, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) c[tid] = a[tid] + b[tid]; // plain wrapping u64 add — toolchain sanity only. +} + +extern "C" __global__ void gl_add_kernel(const uint64_t *a, + const uint64_t *b, + uint64_t *c, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) c[tid] = add(a[tid], b[tid]); +} + +extern "C" __global__ void gl_sub_kernel(const uint64_t *a, + const uint64_t *b, + uint64_t *c, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) c[tid] = sub(a[tid], b[tid]); +} + +extern "C" __global__ void gl_mul_kernel(const uint64_t *a, + const uint64_t *b, + uint64_t *c, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) c[tid] = mul(a[tid], b[tid]); +} + +extern "C" __global__ void gl_neg_kernel(const uint64_t *a, + uint64_t *c, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) c[tid] = neg(a[tid]); +} + +// --------------------------------------------------------------------------- +// Ext3 (Goldilocks cubic extension) test kernels. +// Input/output arrays are interleaved [a_0, b_0, c_0, a_1, b_1, c_1, ...]. +// --------------------------------------------------------------------------- + +extern "C" __global__ void ext3_mul_kernel(const uint64_t *a_int, + const uint64_t *b_int, + uint64_t *c_int, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + ext3::Fe3 a = ext3::make(a_int[tid*3 + 0], a_int[tid*3 + 1], a_int[tid*3 + 2]); + ext3::Fe3 b = ext3::make(b_int[tid*3 + 0], b_int[tid*3 + 1], b_int[tid*3 + 2]); + ext3::Fe3 r = ext3::mul(a, b); + c_int[tid*3 + 0] = r.a; + c_int[tid*3 + 1] = r.b; + c_int[tid*3 + 2] = r.c; +} + +extern "C" __global__ void ext3_add_kernel(const uint64_t *a_int, + const uint64_t *b_int, + uint64_t *c_int, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + ext3::Fe3 a = ext3::make(a_int[tid*3 + 0], a_int[tid*3 + 1], a_int[tid*3 + 2]); + ext3::Fe3 b = ext3::make(b_int[tid*3 + 0], b_int[tid*3 + 1], b_int[tid*3 + 2]); + ext3::Fe3 r = ext3::add(a, b); + c_int[tid*3 + 0] = r.a; + c_int[tid*3 + 1] = r.b; + c_int[tid*3 + 2] = r.c; +} diff --git a/crypto/math-cuda/kernels/ext3.cuh b/crypto/math-cuda/kernels/ext3.cuh new file mode 100644 index 000000000..2f4040714 --- /dev/null +++ b/crypto/math-cuda/kernels/ext3.cuh @@ -0,0 +1,121 @@ +// Goldilocks cubic extension on device: Fp3 = Fp[w] / (w^3 - 2) +// where Fp is Goldilocks (2^64 - 2^32 + 1). +// +// Layout matches the CPU `Degree3GoldilocksExtensionField` (see +// `crypto/math/src/field/extensions_goldilocks.rs`): an element is a +// 3-tuple `(a, b, c)` representing `a + b*w + c*w^2`. +// +// The reducible `w^3 = 2` means cross-term products get a factor of 2: +// (a0 + a1*w + a2*w^2) * (b0 + b1*w + b2*w^2) +// = (a0*b0 + 2*(a1*b2 + a2*b1)) +// + (a0*b1 + a1*b0 + 2*a2*b2) * w +// + (a0*b2 + a1*b1 + a2*b0) * w^2 +// +// We use the same dot-product-of-three folding as the CPU (which saves +// reductions by summing u128 products before `reduce128`). CUDA has +// `__umul64hi` so we implement `dot_product_3` inline. + +#pragma once +#include "goldilocks.cuh" + +namespace ext3 { + +struct Fe3 { + uint64_t a, b, c; +}; + +__device__ __forceinline__ Fe3 make(uint64_t a, uint64_t b, uint64_t c) { + Fe3 r = {a, b, c}; + return r; +} + +__device__ __forceinline__ Fe3 zero() { return make(0, 0, 0); } +__device__ __forceinline__ Fe3 one() { return make(1, 0, 0); } + +__device__ __forceinline__ Fe3 add(const Fe3 &x, const Fe3 &y) { + return make(goldilocks::add(x.a, y.a), + goldilocks::add(x.b, y.b), + goldilocks::add(x.c, y.c)); +} + +__device__ __forceinline__ Fe3 sub(const Fe3 &x, const Fe3 &y) { + return make(goldilocks::sub(x.a, y.a), + goldilocks::sub(x.b, y.b), + goldilocks::sub(x.c, y.c)); +} + +__device__ __forceinline__ Fe3 neg(const Fe3 &x) { + return make(goldilocks::neg(x.a), + goldilocks::neg(x.b), + goldilocks::neg(x.c)); +} + +/// Mixed: base * ext3 → ext3 (componentwise). +__device__ __forceinline__ Fe3 mul_base(const Fe3 &x, uint64_t s) { + return make(goldilocks::mul(x.a, s), + goldilocks::mul(x.b, s), + goldilocks::mul(x.c, s)); +} + +/// Dot-product of three (a0*b0 + a1*b1 + a2*b2) mod p, with one reduce128 +/// on the sum of three u128 products. Matches CPU `dot_product_3`. +__device__ __forceinline__ uint64_t dot3(uint64_t a0, uint64_t b0, + uint64_t a1, uint64_t b1, + uint64_t a2, uint64_t b2) { + // Split the sum of three u128 products into hi/lo u128 halves, then + // reduce once. We track overflow-count (at most 2) and add EPSILON^2 + // per overflow, matching the CPU path. + // prod_i = a_i * b_i (u128) + uint64_t lo0 = a0 * b0, hi0 = __umul64hi(a0, b0); + uint64_t lo1 = a1 * b1, hi1 = __umul64hi(a1, b1); + uint64_t lo2 = a2 * b2, hi2 = __umul64hi(a2, b2); + + // sum01 = prod0 + prod1 (in u128 lanes) + uint64_t s01_lo = lo0 + lo1; + uint64_t carry01 = (s01_lo < lo0) ? 1ULL : 0ULL; + uint64_t s01_hi = hi0 + hi1 + carry01; + uint32_t over1 = (s01_hi < hi0 + carry01) ? 1u : 0u; // low-pass overflow + + // sum012 = sum01 + prod2 + uint64_t s012_lo = s01_lo + lo2; + uint64_t carry012 = (s012_lo < s01_lo) ? 1ULL : 0ULL; + uint64_t s012_hi = s01_hi + hi2 + carry012; + uint32_t over2 = (s012_hi < hi2 + carry012) ? 1u : 0u; + + uint64_t reduced = goldilocks::reduce128(s012_lo, s012_hi); + + uint32_t overflow_count = over1 + over2; + if (overflow_count > 0) { + // 2^128 mod p = EPSILON^2 (= (2^32 - 1)^2). + uint64_t eps = goldilocks::EPSILON; + uint64_t eps_sq = eps * eps; + reduced = goldilocks::add_no_canonicalize(reduced, eps_sq); + if (overflow_count > 1) { + reduced = goldilocks::add_no_canonicalize(reduced, eps_sq); + } + } + return reduced; +} + +/// Full ext3 × ext3 multiplication (matches CPU +/// `Degree3GoldilocksExtensionField::mul`). +__device__ __forceinline__ Fe3 mul(const Fe3 &x, const Fe3 &y) { + // c0 = x.a*y.a + x.b*(2*y.c) + x.c*(2*y.b) + // c1 = x.a*y.b + x.b*y.a + x.c*(2*y.c) + // c2 = x.a*y.c + x.b*y.b + x.c*y.a + uint64_t b1_2 = goldilocks::add(y.b, y.b); + uint64_t b2_2 = goldilocks::add(y.c, y.c); + + uint64_t c0 = dot3(x.a, y.a, x.b, b2_2, x.c, b1_2); + uint64_t c1 = dot3(x.a, y.b, x.b, y.a, x.c, b2_2); + uint64_t c2 = dot3(x.a, y.c, x.b, y.b, x.c, y.a); + return make(c0, c1, c2); +} + +__device__ __forceinline__ Fe3 canonical(const Fe3 &x) { + return make(goldilocks::canonical(x.a), + goldilocks::canonical(x.b), + goldilocks::canonical(x.c)); +} + +} // namespace ext3 diff --git a/crypto/math-cuda/kernels/goldilocks.cuh b/crypto/math-cuda/kernels/goldilocks.cuh new file mode 100644 index 000000000..5e296a390 --- /dev/null +++ b/crypto/math-cuda/kernels/goldilocks.cuh @@ -0,0 +1,69 @@ +// Goldilocks field on device. Ports `crypto/math/src/field/goldilocks.rs` one-to-one: +// - Representation: non-canonical u64 in [0, 2^64). Canonicalise only at boundaries. +// - Prime: 2^64 - 2^32 + 1. +// - Reduction: exploits 2^64 ≡ EPSILON (mod p) and 2^96 ≡ -1 (mod p). +// +// The arithmetic here must produce bit-identical u64 outputs to the CPU path so +// LDE parity tests can assert raw equality. + +#pragma once +#include + +namespace goldilocks { + +__device__ constexpr uint64_t PRIME = 0xFFFFFFFF00000001ULL; +__device__ constexpr uint64_t EPSILON = 0xFFFFFFFFULL; // 2^32 - 1 + +__device__ __forceinline__ uint64_t add_no_canonicalize(uint64_t x, uint64_t y) { + // Mirror of `add_no_canonicalize_trashing_input`: one add, one EPSILON bump on carry. + uint64_t sum = x + y; + return sum + (sum < x ? EPSILON : 0ULL); +} + +__device__ __forceinline__ uint64_t add(uint64_t a, uint64_t b) { + uint64_t sum = a + b; + uint64_t over1 = (sum < a) ? EPSILON : 0ULL; + uint64_t sum2 = sum + over1; + uint64_t over2 = (sum2 < sum) ? EPSILON : 0ULL; + return sum2 + over2; +} + +__device__ __forceinline__ uint64_t sub(uint64_t a, uint64_t b) { + uint64_t diff = a - b; + uint64_t under1 = (a < b) ? EPSILON : 0ULL; + uint64_t diff2 = diff - under1; + uint64_t under2 = (diff2 > diff) ? EPSILON : 0ULL; + return diff2 - under2; +} + +__device__ __forceinline__ uint64_t reduce128(uint64_t lo, uint64_t hi) { + uint64_t x_hi_hi = hi >> 32; + uint64_t x_hi_lo = hi & EPSILON; + + // 2^96 ≡ -1 (mod p): subtract x_hi_hi from lo, EPSILON-correct on borrow. + uint64_t t0 = lo - x_hi_hi; + if (lo < x_hi_hi) t0 -= EPSILON; + + // 2^64 ≡ EPSILON (mod p): x_hi_lo * EPSILON = (x_hi_lo << 32) - x_hi_lo. + uint64_t t1 = (x_hi_lo << 32) - x_hi_lo; + + return add_no_canonicalize(t0, t1); +} + +__device__ __forceinline__ uint64_t mul(uint64_t a, uint64_t b) { + uint64_t lo = a * b; + uint64_t hi = __umul64hi(a, b); + return reduce128(lo, hi); +} + +__device__ __forceinline__ uint64_t neg(uint64_t a) { + // `a` may be non-canonical. Canonicalise first, then p - a (or 0). + uint64_t canon = (a >= PRIME) ? (a - PRIME) : a; + return canon == 0 ? 0 : (PRIME - canon); +} + +__device__ __forceinline__ uint64_t canonical(uint64_t a) { + return (a >= PRIME) ? (a - PRIME) : a; +} + +} // namespace goldilocks diff --git a/crypto/math-cuda/kernels/ntt.cu b/crypto/math-cuda/kernels/ntt.cu new file mode 100644 index 000000000..2a5c8c786 --- /dev/null +++ b/crypto/math-cuda/kernels/ntt.cu @@ -0,0 +1,285 @@ +// Radix-2 DIT NTT over Goldilocks. One kernel per butterfly level; the caller +// runs `bit_reverse_permute` once before the first level. +// +// Input layout: bit-reversed-order coefficients (after `bit_reverse_permute`). +// Output layout: natural-order evaluations — matches the CPU `evaluate_fft` contract. +// +// Twiddle table: `tw[i] = ω^i` for i in [0, n/2). Stride-indexed per level. + +#include "goldilocks.cuh" + +using goldilocks::add; +using goldilocks::sub; +using goldilocks::mul; + +/// Reverse the low `log_n` bits of each index and swap x[i] ↔ x[rev(i)]. +/// One thread per index; guarded by `tid < rev` to avoid double-swap. +extern "C" __global__ void bit_reverse_permute(uint64_t *x, + uint64_t n, + uint64_t log_n) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + + // __brevll reverses all 64 bits; shift right so result lives in [0, n). + uint64_t rev = __brevll(tid) >> (64 - log_n); + if (tid < rev) { + uint64_t tmp = x[tid]; + x[tid] = x[rev]; + x[rev] = tmp; + } +} + +/// Pointwise multiply: x[i] *= w[i]. Used for coset scaling (w = g^i weights). +extern "C" __global__ void pointwise_mul(uint64_t *x, + const uint64_t *w, + uint64_t n) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) x[tid] = mul(x[tid], w[tid]); +} + +/// Broadcast scalar multiply: x[i] *= c. Used for the 1/n factor at the end of iNTT. +extern "C" __global__ void scalar_mul(uint64_t *x, + uint64_t c, + uint64_t n) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) x[tid] = mul(x[tid], c); +} + +// ============================================================================ +// BATCHED KERNELS +// +// One launch processes M columns at once. The device buffer holds M columns +// back-to-back; column `c` starts at `data + c * col_stride`. gridDim.y is +// the column index, so each block handles one (column, butterfly-window) pair. +// +// The same twiddle table is shared across all columns of a batch (they all +// NTT on the same domain). The coset weights are also shared. +// ============================================================================ + +extern "C" __global__ void bit_reverse_permute_batched(uint64_t *data, + uint64_t n, + uint64_t log_n, + uint64_t col_stride) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + uint64_t *x = data + (uint64_t)blockIdx.y * col_stride; + + uint64_t rev = __brevll(tid) >> (64 - log_n); + if (tid < rev) { + uint64_t tmp = x[tid]; + x[tid] = x[rev]; + x[rev] = tmp; + } +} + +extern "C" __global__ void ntt_dit_level_batched(uint64_t *data, + const uint64_t *tw, + uint64_t n, + uint64_t log_n, + uint64_t level, + uint64_t col_stride) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint64_t n_half = n >> 1; + if (tid >= n_half) return; + uint64_t *x = data + (uint64_t)blockIdx.y * col_stride; + + uint64_t half = 1ULL << level; + uint64_t block_size = half << 1; + uint64_t block_idx = tid >> level; + uint64_t k = tid & (half - 1); + + uint64_t i0 = block_idx * block_size + k; + uint64_t i1 = i0 + half; + + uint64_t tw_index = k << (log_n - level - 1); + uint64_t w = tw[tw_index]; + + uint64_t u = x[i0]; + uint64_t v = mul(w, x[i1]); + x[i0] = add(u, v); + x[i1] = sub(u, v); +} + +extern "C" __global__ void ntt_dit_8_levels_batched(uint64_t *data, + const uint64_t *tw, + uint64_t n, + uint64_t log_n, + uint64_t base_step, + uint64_t col_stride) { + __shared__ uint64_t tile[256]; + uint64_t *x = data + (uint64_t)blockIdx.y * col_stride; + + uint32_t n_loc_steps = (uint32_t)min((uint64_t)8, log_n - base_step); + + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + + uint64_t group_size = 1ULL << base_step; + uint64_t n_groups = n >> base_step; + uint64_t low_bits = tid / n_groups; + uint64_t high_bits = tid & (n_groups - 1); + uint64_t row = high_bits * group_size + low_bits; + + tile[threadIdx.x] = x[row]; + __syncthreads(); + + uint32_t remaining_high_bits = (uint32_t)(log_n - base_step - 1); + uint32_t high_mask = (1u << remaining_high_bits) - 1u; + + for (uint32_t loc_step = 0; loc_step < n_loc_steps; ++loc_step) { + if (threadIdx.x < 128) { + uint32_t i = threadIdx.x; + uint32_t half = 1u << loc_step; + uint32_t grp = i >> loc_step; + uint32_t grp_pos = i & (half - 1); + uint32_t idx1 = (grp << (loc_step + 1)) + grp_pos; + uint32_t idx2 = idx1 + half; + + uint32_t gs = (uint32_t)base_step + loc_step; + uint32_t ggp = (blockIdx.x << 7) + i; + ggp = ((ggp & high_mask) << (uint32_t)base_step) + (ggp >> remaining_high_bits); + ggp = ggp & ((1u << gs) - 1u); + uint64_t factor = tw[(uint64_t)ggp * (n >> (gs + 1))]; + + uint64_t u = tile[idx1]; + uint64_t v = mul(tile[idx2], factor); + tile[idx1] = add(u, v); + tile[idx2] = sub(u, v); + } + __syncthreads(); + } + + x[row] = tile[threadIdx.x]; +} + + +/// Batched pointwise multiply: first n elements of each column multiplied by +/// the SHARED weight vector `w` (size n). Used for coset scaling — every +/// column of a table sees the same `g^i / N` weights. +extern "C" __global__ void pointwise_mul_batched(uint64_t *data, + const uint64_t *w, + uint64_t n, + uint64_t col_stride) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + uint64_t *x = data + (uint64_t)blockIdx.y * col_stride; + x[tid] = mul(x[tid], w[tid]); +} + +/// Batched broadcast scalar multiply — one scalar c applied to the first n +/// elements of every column. +extern "C" __global__ void scalar_mul_batched(uint64_t *data, + uint64_t c, + uint64_t n, + uint64_t col_stride) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + uint64_t *x = data + (uint64_t)blockIdx.y * col_stride; + x[tid] = mul(x[tid], c); +} + +/// One DIT butterfly level. Thread `tid` (of n/2 total) owns exactly one +/// butterfly pair (i0, i1 = i0 + half). Twiddle picked from the shared full +/// `tw` table at stride `n / block_size`. Kept for log_n < 8 where shmem +/// fusion is overkill. +extern "C" __global__ void ntt_dit_level(uint64_t *x, + const uint64_t *tw, + uint64_t n, + uint64_t log_n, + uint64_t level) { + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint64_t n_half = n >> 1; + if (tid >= n_half) return; + + uint64_t half = 1ULL << level; // 2^ℓ + uint64_t block_size = half << 1; // 2^{ℓ+1} + uint64_t block_idx = tid >> level; // floor(tid / half) + uint64_t k = tid & (half - 1); // tid mod half + + uint64_t i0 = block_idx * block_size + k; + uint64_t i1 = i0 + half; + + // Stride = n / block_size = n >> (level + 1). + uint64_t tw_index = k << (log_n - level - 1); + uint64_t w = tw[tw_index]; + + uint64_t u = x[i0]; + uint64_t v = mul(w, x[i1]); + x[i0] = add(u, v); + x[i1] = sub(u, v); +} + +/// Up to 8 DIT butterfly levels fused in one kernel using shared memory. +/// +/// Ported from Zisk's `br_ntt_8_steps` (`pil2-stark/src/goldilocks/src/ntt_goldilocks.cu`), +/// simplified to single-column. Each block of 256 threads processes 256 +/// elements in on-chip shared memory, running up to 8 butterfly levels +/// without writing to global memory between them — cuts DRAM traffic by up +/// to 8× vs the per-level kernel. +/// +/// `base_step` selects which 8-level window this launch handles (0, 8, 16…). +/// For levels 0–7 the implicit DIT element layout already places all pair +/// mates inside the same 256-block; for higher base_step we remap the loaded +/// row so pair mates land in consecutive shared-memory slots. +/// +/// Expects bit-reversed input (the caller runs `bit_reverse_permute` once +/// before the first kernel launch). +/// +/// Assumes `n` is a multiple of 256, i.e. `log_n >= 8`. +extern "C" __global__ void ntt_dit_8_levels(uint64_t *x, + const uint64_t *tw, + uint64_t n, + uint64_t log_n, + uint64_t base_step) { + __shared__ uint64_t tile[256]; + + uint32_t n_loc_steps = (uint32_t)min((uint64_t)8, log_n - base_step); + + // tid is the *unpermuted* flat index the block/thread would own. + uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + + // Row remap: for base_step > 0, gather elements that pair at levels + // `base_step..base_step+7` so they land consecutively in the block. + uint64_t group_size = 1ULL << base_step; + uint64_t n_groups = n >> base_step; // = n / group_size + uint64_t low_bits = tid / n_groups; + uint64_t high_bits = tid & (n_groups - 1); + uint64_t row = high_bits * group_size + low_bits; + + // Load one element per thread. + tile[threadIdx.x] = x[row]; + __syncthreads(); + + // Each butterfly level uses half the threads (128 butterflies per block). + // The global butterfly index `ggp` is recovered from blockIdx + threadIdx + // and reshaped by the same row-remap to find the right twiddle. + uint32_t remaining_high_bits = (uint32_t)(log_n - base_step - 1); // log2(n_groups / 2) + uint32_t high_mask = (1u << remaining_high_bits) - 1u; + + for (uint32_t loc_step = 0; loc_step < n_loc_steps; ++loc_step) { + if (threadIdx.x < 128) { + uint32_t i = threadIdx.x; + uint32_t half = 1u << loc_step; + uint32_t grp = i >> loc_step; + uint32_t grp_pos = i & (half - 1); + uint32_t idx1 = (grp << (loc_step + 1)) + grp_pos; + uint32_t idx2 = idx1 + half; + + // Global step and butterfly position for twiddle lookup. + uint32_t gs = (uint32_t)base_step + loc_step; + uint32_t ggp = (blockIdx.x << 7) + i; // blockIdx * 128 + i + // Un-remap ggp to find its position in the natural ordering. + ggp = ((ggp & high_mask) << (uint32_t)base_step) + (ggp >> remaining_high_bits); + ggp = ggp & ((1u << gs) - 1u); + uint64_t factor = tw[(uint64_t)ggp * (n >> (gs + 1))]; + + uint64_t u = tile[idx1]; + uint64_t v = mul(tile[idx2], factor); + tile[idx1] = add(u, v); + tile[idx2] = sub(u, v); + } + __syncthreads(); + } + + // Store back to the remapped row. + x[row] = tile[threadIdx.x]; +} diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs new file mode 100644 index 000000000..2c70716a6 --- /dev/null +++ b/crypto/math-cuda/src/device.rs @@ -0,0 +1,251 @@ +//! CUDA device context, stream pool, kernel handles, and twiddle cache. +//! +//! One process-wide backend — lazy-initialised on first use. All kernels live +//! on a single CUDA context; a pool of streams lets rayon-parallel callers +//! overlap H2D / compute / D2H. + +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, OnceLock}; + +use cudarc::driver::{CudaContext, CudaFunction, CudaSlice, CudaStream}; +use cudarc::nvrtc::Ptx; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::IsFFTField; + +use crate::Result; +use crate::ntt::{twiddles_forward, twiddles_inverse}; + +/// Reusable pinned host staging buffer. One per stream; the stream's LDE call +/// holds its buffer's lock across the D2H + memcpy-to-user-Vecs window. +/// +/// Allocated with `cuMemHostAlloc(flags=0)` — portable, non-write-combined, +/// so both DMA writes from device and CPU reads into user Vecs run at full +/// speed. Grows power-of-two; never shrinks. +pub struct PinnedStaging { + ptr: *mut u64, + capacity_elems: usize, +} + +// SAFETY: the raw pointer aliases host memory allocated via cuMemHostAlloc. +// We guard concurrent access with a Mutex; the pointer is valid for the +// lifetime of this struct and is freed on drop. +unsafe impl Send for PinnedStaging {} +unsafe impl Sync for PinnedStaging {} + +impl PinnedStaging { + const fn empty() -> Self { + Self { + ptr: std::ptr::null_mut(), + capacity_elems: 0, + } + } + + pub fn ensure_capacity( + &mut self, + min_elems: usize, + ctx: &CudaContext, + ) -> Result<()> { + if self.capacity_elems >= min_elems { + return Ok(()); + } + // cuMemHostAlloc requires the context to be current on this thread. + ctx.bind_to_thread()?; + // Free old (if any) before allocating the new one. + if !self.ptr.is_null() { + unsafe { + let _ = cudarc::driver::sys::cuMemFreeHost(self.ptr as *mut _); + } + self.ptr = std::ptr::null_mut(); + self.capacity_elems = 0; + } + let new_cap = min_elems.next_power_of_two().max(1 << 20); // at least 8 MB + let bytes = new_cap * std::mem::size_of::(); + let ptr = unsafe { + cudarc::driver::result::malloc_host(bytes, 0 /* flags: non-WC */)? + } as *mut u64; + self.ptr = ptr; + self.capacity_elems = new_cap; + Ok(()) + } + + /// View of the first `len` elements. Caller must hold this `PinnedStaging` + /// locked while using the slice; the slice aliases the internal pointer. + /// + /// # Safety + /// Caller must not outlive the `PinnedStaging` and must not race with + /// concurrent uses. + pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [u64] { + assert!(len <= self.capacity_elems); + if len == 0 { + return &mut []; + } + unsafe { std::slice::from_raw_parts_mut(self.ptr, len) } + } +} + +impl Drop for PinnedStaging { + fn drop(&mut self) { + if !self.ptr.is_null() { + unsafe { + let _ = cudarc::driver::sys::cuMemFreeHost(self.ptr as *mut _); + } + } + } +} + +const ARITH_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/arith.ptx")); +const NTT_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/ntt.ptx")); +/// Number of CUDA streams in the pool. Larger pools let many rayon-parallel +/// callers overlap on the GPU without serializing on stream ownership. The +/// default stream is deliberately excluded because it synchronises with all +/// other streams, defeating the point of the pool. +const STREAM_POOL_SIZE: usize = 32; + +pub struct Backend { + pub ctx: Arc, + streams: Vec>, + /// Single shared pinned staging buffer, grown to the biggest LDE size + /// seen. Concurrent batched LDE calls serialise on it; in exchange the + /// process keeps only ONE gigabyte-sized pinned allocation (per-stream + /// buffers 32×-inflated memory use and multiplied the one-time pinning + /// cost for every first use of a new table size). + pinned_staging: Mutex, + util_stream: Arc, + next: AtomicUsize, + + // arith.ptx + pub vector_add_u64: CudaFunction, + pub gl_add: CudaFunction, + pub gl_sub: CudaFunction, + pub gl_mul: CudaFunction, + pub gl_neg: CudaFunction, + pub ext3_mul: CudaFunction, + pub ext3_add: CudaFunction, + + // ntt.ptx + pub bit_reverse_permute: CudaFunction, + pub ntt_dit_level: CudaFunction, + pub ntt_dit_8_levels: CudaFunction, + pub pointwise_mul: CudaFunction, + pub scalar_mul: CudaFunction, + pub bit_reverse_permute_batched: CudaFunction, + pub ntt_dit_level_batched: CudaFunction, + pub ntt_dit_8_levels_batched: CudaFunction, + pub pointwise_mul_batched: CudaFunction, + pub scalar_mul_batched: CudaFunction, + + // Twiddle caches keyed by log_n. + fwd_twiddles: Mutex>>>>, + inv_twiddles: Mutex>>>>, +} + +impl Backend { + fn init() -> Result { + let ctx = CudaContext::new(0)?; + // cudarc's default per-slice CudaEvent tracking adds two driver calls + // per alloc and serialises under the context lock. We never share + // slices across streams (every call scopes its own buffers and syncs + // before returning), so the tracking is pure overhead. Disable it. + unsafe { ctx.disable_event_tracking() }; + + let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?; + let ntt = ctx.load_module(Ptx::from_src(NTT_PTX))?; + + let mut streams = Vec::with_capacity(STREAM_POOL_SIZE); + for _ in 0..STREAM_POOL_SIZE { + streams.push(ctx.new_stream()?); + } + let pinned_staging = Mutex::new(PinnedStaging::empty()); + // Separate "utility" stream for twiddle uploads and other bookkeeping; + // not part of the pool that callers rotate through. + let util_stream = ctx.new_stream()?; + + // Goldilocks TWO_ADICITY is 32, so log_n ≤ 32 covers every LDE size + // the prover can produce. Overshoot by one for safety. + let max_log = GoldilocksField::TWO_ADICITY as usize + 1; + + Ok(Self { + vector_add_u64: arith.load_function("vector_add_u64")?, + gl_add: arith.load_function("gl_add_kernel")?, + gl_sub: arith.load_function("gl_sub_kernel")?, + gl_mul: arith.load_function("gl_mul_kernel")?, + gl_neg: arith.load_function("gl_neg_kernel")?, + ext3_mul: arith.load_function("ext3_mul_kernel")?, + ext3_add: arith.load_function("ext3_add_kernel")?, + bit_reverse_permute: ntt.load_function("bit_reverse_permute")?, + ntt_dit_level: ntt.load_function("ntt_dit_level")?, + ntt_dit_8_levels: ntt.load_function("ntt_dit_8_levels")?, + pointwise_mul: ntt.load_function("pointwise_mul")?, + scalar_mul: ntt.load_function("scalar_mul")?, + bit_reverse_permute_batched: ntt.load_function("bit_reverse_permute_batched")?, + ntt_dit_level_batched: ntt.load_function("ntt_dit_level_batched")?, + ntt_dit_8_levels_batched: ntt.load_function("ntt_dit_8_levels_batched")?, + pointwise_mul_batched: ntt.load_function("pointwise_mul_batched")?, + scalar_mul_batched: ntt.load_function("scalar_mul_batched")?, + fwd_twiddles: Mutex::new(vec![None; max_log]), + inv_twiddles: Mutex::new(vec![None; max_log]), + ctx, + streams, + pinned_staging, + util_stream, + next: AtomicUsize::new(0), + }) + } + + /// Round-robin over the stream pool. Concurrent callers get different + /// streams so their kernel launches overlap on the GPU. + pub fn next_stream(&self) -> Arc { + let idx = self.next.fetch_add(1, Ordering::Relaxed) % self.streams.len(); + self.streams[idx].clone() + } + + /// Shared pinned staging buffer. Grows to the largest LDE the process + /// has seen so far. Concurrent callers serialise on the mutex. + pub fn pinned_staging(&self) -> &Mutex { + &self.pinned_staging + } + + pub fn fwd_twiddles_for(&self, log_n: u64) -> Result>> { + self.cached_twiddles(log_n, true) + } + + pub fn inv_twiddles_for(&self, log_n: u64) -> Result>> { + self.cached_twiddles(log_n, false) + } + + fn cached_twiddles(&self, log_n: u64, forward: bool) -> Result>> { + let idx = log_n as usize; + let cache = if forward { + &self.fwd_twiddles + } else { + &self.inv_twiddles + }; + { + let guard = cache.lock().unwrap(); + if let Some(t) = &guard[idx] { + return Ok(t.clone()); + } + } + // Compute on host, upload on the utility stream. Another thread may + // have populated the cache in the meantime; prefer that entry. + let host = if forward { + twiddles_forward(log_n) + } else { + twiddles_inverse(log_n) + }; + let dev = Arc::new(self.util_stream.clone_htod(&host)?); + self.util_stream.synchronize()?; + let mut guard = cache.lock().unwrap(); + if let Some(t) = &guard[idx] { + Ok(t.clone()) + } else { + guard[idx] = Some(dev.clone()); + Ok(dev) + } + } +} + +pub fn backend() -> &'static Backend { + static BACKEND: OnceLock = OnceLock::new(); + BACKEND.get_or_init(|| Backend::init().expect("failed to initialise CUDA backend")) +} diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs new file mode 100644 index 000000000..fd25d1bca --- /dev/null +++ b/crypto/math-cuda/src/lde.rs @@ -0,0 +1,984 @@ +//! Full coset LDE on device. Mirrors `Polynomial::coset_lde_full_expand` in +//! `crypto/math/src/fft/polynomial.rs` algebraically: +//! +//! Input : N evaluations (natural order) of a poly on the standard subgroup, +//! plus coset weights (size N). The weights include the `1/N` iFFT +//! normalisation, matching the `LdeTwiddles::coset_weights` format at +//! `crypto/stark/src/prover.rs:248` — i.e. `weights[i] = g^i / N`. +//! Output : N*blowup_factor evaluations (natural order) on the coset. +//! +//! On-device steps, picks a stream from the shared pool so rayon-parallel +//! callers overlap on the GPU. Twiddles are cached in the backend. + +use std::sync::Arc; + +use cudarc::driver::{CudaSlice, LaunchConfig, PushKernelArg}; + +use crate::Result; +use crate::device::backend; +use crate::ntt::run_ntt_body; + +/// Handle to a base-field LDE kept live on device after R1 commit. +/// Layout: `m` columns, each `lde_size` u64s, column `c` at byte offset +/// `c * lde_size * 8` within `buf`. Freed when `buf` Arc drops. +#[derive(Clone)] +pub struct GpuLdeBase { + pub buf: Arc>, + pub m: usize, + pub lde_size: usize, +} + +/// Handle to an ext3 LDE kept live on device, de-interleaved into 3 base +/// slabs per column. Column `c` component `k` at u64 offset +/// `(c*3 + k) * lde_size` within `buf`. +#[derive(Clone)] +pub struct GpuLdeExt3 { + pub buf: Arc>, + pub m: usize, + pub lde_size: usize, +} + +pub fn coset_lde_base( + evals: &[u64], + blowup_factor: usize, + weights: &[u64], +) -> Result> { + let n = evals.len(); + assert!(n.is_power_of_two(), "evals length must be a power of two"); + assert_eq!(weights.len(), n, "weights length must match evals"); + assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + if n == 0 { + return Ok(Vec::new()); + } + let lde_size = n * blowup_factor; + let log_n = n.trailing_zeros() as u64; + let log_lde = lde_size.trailing_zeros() as u64; + + let be = backend(); + let stream = be.next_stream(); + + // Device buffer of lde_size, zero-padded tail, first N filled by copy. + let mut buf = stream.alloc_zeros::(lde_size)?; + { + let mut head = buf.slice_mut(0..n); + stream.memcpy_htod(evals, &mut head)?; + } + + let inv_tw = be.inv_twiddles_for(log_n)?; + let fwd_tw = be.fwd_twiddles_for(log_lde)?; + let weights_dev = stream.clone_htod(weights)?; + + let n_u64 = n as u64; + let lde_u64 = lde_size as u64; + + // === 1. iNTT on first N: bit_reverse + 8-level-fused DIT body === + unsafe { + stream + .launch_builder(&be.bit_reverse_permute) + .arg(&mut buf) + .arg(&n_u64) + .arg(&log_n) + .launch(LaunchConfig::for_num_elems(n as u32))?; + } + // Note: `run_ntt_body` expects a standalone CudaSlice; we pass `buf` and + // the kernel walks the first `n_u64` elements via its own indexing. + run_ntt_body(stream.as_ref(), &mut buf, inv_tw.as_ref(), n_u64, log_n)?; + // Note: the CPU iFFT does not include 1/N — it's folded into `weights`. The + // next pointwise multiply applies both the coset shift and the 1/N factor. + + // === 2. Pointwise multiply first N by coset weights (includes 1/N) === + unsafe { + stream + .launch_builder(&be.pointwise_mul) + .arg(&mut buf) + .arg(&weights_dev) + .arg(&n_u64) + .launch(LaunchConfig::for_num_elems(n as u32))?; + } + + // === 3. Forward NTT on full buffer === + unsafe { + stream + .launch_builder(&be.bit_reverse_permute) + .arg(&mut buf) + .arg(&lde_u64) + .arg(&log_lde) + .launch(LaunchConfig::for_num_elems(lde_size as u32))?; + } + run_ntt_body(stream.as_ref(), &mut buf, fwd_tw.as_ref(), lde_u64, log_lde)?; + + let out = stream.clone_dtoh(&buf)?; + stream.synchronize()?; + Ok(out) +} + +/// Batched coset LDE: processes `m` columns (all the same domain) in a single +/// pipeline on one stream. One H2D per column, then per-level batched kernels +/// that launch with `grid.y = m` so a single launch does the butterflies for +/// every column at that level. +/// +/// Returns one `Vec` per input column, each of length `n * blowup_factor`. +pub fn coset_lde_batch_base( + columns: &[&[u64]], + blowup_factor: usize, + weights: &[u64], +) -> Result>> { + if columns.is_empty() { + return Ok(Vec::new()); + } + let m = columns.len(); + let n = columns[0].len(); + assert!(n.is_power_of_two(), "column length must be a power of two"); + assert_eq!(weights.len(), n, "weights length must match column length"); + assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + for c in columns.iter() { + assert_eq!(c.len(), n, "all columns must be the same size"); + } + + if n == 0 { + return Ok(vec![Vec::new(); m]); + } + let lde_size = n * blowup_factor; + let log_n = n.trailing_zeros() as u64; + let log_lde = lde_size.trailing_zeros() as u64; + + let be = backend(); + let stream = be.next_stream(); + let staging_slot = be.pinned_staging(); + + let debug_phases = std::env::var("MATH_CUDA_PHASE_TIMING").is_ok(); + let t_start = if debug_phases { Some(std::time::Instant::now()) } else { None }; + let phase = |label: &str, prev: &mut Option| { + if let Some(p) = prev.as_ref() { + let now = std::time::Instant::now(); + eprintln!(" [{:>6.2} ms] {}", (now - *p).as_secs_f64() * 1e3, label); + *prev = Some(now); + } + }; + let mut last = t_start; + + // Pinned staging. Lock and grow to max(m*n for upload, m*lde_size for + // download). Holding the guard across the whole call serialises concurrent + // batched calls that happened to hash to the same stream slot, but that's + // exactly what we want — one stream can only do one sequence at a time. + let mut staging = staging_slot.lock().unwrap(); + staging.ensure_capacity(m * lde_size, &be.ctx)?; + // SAFETY: staging is locked, the slice alias ends before we unlock. + let pinned = unsafe { staging.as_mut_slice(m * lde_size) }; + if debug_phases { phase("staging lock + grow", &mut last); } + + // Pack columns into first m*n slots of the pinned buffer. Parallel: pinned + // writes are DRAM-bandwidth bound, saturates at ~8 cores on modern + // hardware, so rayon shaves 20+ ms at prover scale. + use rayon::prelude::*; + let pinned_base_ptr = pinned.as_mut_ptr() as usize; + columns.par_iter().enumerate().for_each(|(c, col)| { + // SAFETY: each task writes to a disjoint `[c*n..c*n+n]` region of + // `pinned`, and the outer `staging` lock guarantees no other call is + // using the buffer concurrently. + let dst = unsafe { + std::slice::from_raw_parts_mut( + (pinned_base_ptr as *mut u64).add(c * n), + n, + ) + }; + dst.copy_from_slice(col); + }); + if debug_phases { phase("host pack (pinned, rayon)", &mut last); } + + // Column layout: `buf[c * lde_size + r]`. Zeroed so the [n, lde_size) + // tail of each column is already the zero-pad the CPU path does. + let mut buf = stream.alloc_zeros::(m * lde_size)?; + if debug_phases { stream.synchronize()?; phase("alloc_zeros", &mut last); } + // One memcpy per column from the pinned buffer into the strided slots. + // The pinned source hits PCIe line-rate. + for c in 0..m { + let mut dst = buf.slice_mut(c * lde_size..c * lde_size + n); + stream.memcpy_htod(&pinned[c * n..c * n + n], &mut dst)?; + } + if debug_phases { stream.synchronize()?; phase("H2D cols (pinned)", &mut last); } + + let inv_tw = be.inv_twiddles_for(log_n)?; + let fwd_tw = be.fwd_twiddles_for(log_lde)?; + let weights_dev = stream.clone_htod(weights)?; + if debug_phases { stream.synchronize()?; phase("twiddles + weights", &mut last); } + + let n_u64 = n as u64; + let lde_u64 = lde_size as u64; + let col_stride_u64 = lde_size as u64; + let m_u32 = m as u32; + + // === 1. Bit-reverse first N of every column === + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, m_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&n_u64) + .arg(&log_n) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + + if debug_phases { stream.synchronize()?; phase("bit_reverse N", &mut last); } + // === 2. iNTT body over all columns === + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + inv_tw.as_ref(), + n_u64, + log_n, + col_stride_u64, + m_u32, + )?; + if debug_phases { stream.synchronize()?; phase("iNTT body", &mut last); } + + // === 3. Pointwise multiply by coset weights (includes 1/N) === + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, m_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.pointwise_mul_batched) + .arg(&mut buf) + .arg(&weights_dev) + .arg(&n_u64) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + + // === 4. Bit-reverse full LDE of every column === + { + let grid_x = (lde_size as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, m_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&lde_u64) + .arg(&log_lde) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + + if debug_phases { stream.synchronize()?; phase("pointwise + bit_reverse LDE", &mut last); } + // === 5. Forward NTT on full LDE of every column === + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + fwd_tw.as_ref(), + lde_u64, + log_lde, + col_stride_u64, + m_u32, + )?; + if debug_phases { stream.synchronize()?; phase("forward NTT body", &mut last); } + + // Single big D2H into the reusable pinned staging buffer — pinned, one + // call to the driver, saturates PCIe. + stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?; + stream.synchronize()?; + if debug_phases { phase("D2H (one shot into pinned)", &mut last); } + + // Split pinned → per-column Vecs. The first write to each virgin + // Vec page-faults, which can dominate total time (~75 ms for 128 MB). + // Parallelise so the fault cost spreads across CPU cores. + use rayon::prelude::*; + let pinned_ptr = pinned.as_ptr() as usize; + let out: Vec> = (0..m) + .into_par_iter() + .map(|c| { + let mut v = Vec::::with_capacity(lde_size); + unsafe { v.set_len(lde_size) }; + let src = unsafe { + std::slice::from_raw_parts( + (pinned_ptr as *const u64).add(c * lde_size), + lde_size, + ) + }; + v.copy_from_slice(src); + v + }) + .collect(); + if debug_phases { phase("copy out (rayon pinned → Vecs)", &mut last); } + drop(staging); + Ok(out) +} + +/// Like `coset_lde_batch_base` but writes directly into caller-provided +/// output slices instead of allocating fresh `Vec`s. Each output slice +/// must already have length `n * blowup_factor`. Saves ~50–100 ms of pageable +/// allocator work + page faults at prover scale because the caller's Vecs +/// have been sized once and are reused across calls. +pub fn coset_lde_batch_base_into( + columns: &[&[u64]], + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], +) -> Result<()> { + if columns.is_empty() { + return Ok(()); + } + let m = columns.len(); + assert_eq!(outputs.len(), m, "outputs must match columns count"); + let n = columns[0].len(); + assert!(n.is_power_of_two(), "column length must be a power of two"); + assert_eq!(weights.len(), n, "weights length must match column length"); + assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + for c in columns.iter() { + assert_eq!(c.len(), n, "all columns must be the same size"); + } + let lde_size = n * blowup_factor; + for o in outputs.iter() { + assert_eq!(o.len(), lde_size, "each output must be lde_size"); + } + if n == 0 { + return Ok(()); + } + let log_n = n.trailing_zeros() as u64; + let log_lde = lde_size.trailing_zeros() as u64; + + let be = backend(); + let stream = be.next_stream(); + let staging_slot = be.pinned_staging(); + + let mut staging = staging_slot.lock().unwrap(); + staging.ensure_capacity(m * lde_size, &be.ctx)?; + let pinned = unsafe { staging.as_mut_slice(m * lde_size) }; + + for (c, col) in columns.iter().enumerate() { + pinned[c * n..c * n + n].copy_from_slice(col); + } + + let mut buf = stream.alloc_zeros::(m * lde_size)?; + for c in 0..m { + let mut dst = buf.slice_mut(c * lde_size..c * lde_size + n); + stream.memcpy_htod(&pinned[c * n..c * n + n], &mut dst)?; + } + + let inv_tw = be.inv_twiddles_for(log_n)?; + let fwd_tw = be.fwd_twiddles_for(log_lde)?; + let weights_dev = stream.clone_htod(weights)?; + + let n_u64 = n as u64; + let lde_u64 = lde_size as u64; + let col_stride_u64 = lde_size as u64; + let m_u32 = m as u32; + + // iNTT bit-reverse + body, pointwise mul, forward bit-reverse + body. + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, m_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&n_u64) + .arg(&log_n) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + inv_tw.as_ref(), + n_u64, + log_n, + col_stride_u64, + m_u32, + )?; + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, m_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.pointwise_mul_batched) + .arg(&mut buf) + .arg(&weights_dev) + .arg(&n_u64) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + { + let grid_x = (lde_size as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, m_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&lde_u64) + .arg(&log_lde) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + fwd_tw.as_ref(), + lde_u64, + log_lde, + col_stride_u64, + m_u32, + )?; + + stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?; + stream.synchronize()?; + + // Parallel copy pinned → caller outputs. Caller's Vecs may still fault + // on first write; we spread that cost across rayon cores. + #[allow(unused_imports)] + use rayon::prelude::*; + let pinned_ptr = pinned.as_ptr() as usize; + outputs + .par_iter_mut() + .enumerate() + .for_each(|(c, dst)| { + let src = unsafe { + std::slice::from_raw_parts( + (pinned_ptr as *const u64).add(c * lde_size), + lde_size, + ) + }; + dst.copy_from_slice(src); + }); + drop(staging); + Ok(()) +} + +/// Batched ext3 polynomial → coset evaluation. +/// +/// Input: M ext3 columns of `n` coefficients each (interleaved, 3n u64). +/// Output: M ext3 columns of `n * blowup_factor` evaluations each at the +/// offset-coset. +/// +/// Skips the iFFT stage of [`coset_lde_batch_ext3_into`] (input is +/// coefficients, not evaluations). Weights encode the coset shift: +/// `weights[k] = offset^k` (NO 1/N because iFFT normalisation doesn't apply). +/// +/// Used by the stark prover to GPU-accelerate +/// `evaluate_polynomial_on_lde_domain` calls inside the +/// `number_of_parts > 2` branch of the composition-polynomial LDE. +pub fn evaluate_poly_coset_batch_ext3_into( + coefs: &[&[u64]], + n: usize, + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], +) -> Result<()> { + evaluate_poly_coset_batch_ext3_into_inner( + coefs, + n, + blowup_factor, + weights, + outputs, + false, + ) + .map(|_| ()) +} + +/// Same as [`evaluate_poly_coset_batch_ext3_into`] but retains the de- +/// interleaved LDE device buffer as a `GpuLdeExt3` handle. Lets R2 commit +/// and R4 DEEP composition read the composition-parts LDE without +/// re-H2D'ing. +pub fn evaluate_poly_coset_batch_ext3_into_keep( + coefs: &[&[u64]], + n: usize, + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], +) -> Result { + let opt = evaluate_poly_coset_batch_ext3_into_inner( + coefs, + n, + blowup_factor, + weights, + outputs, + true, + )?; + Ok(opt.expect("keep_device_buf=true must return Some")) +} + +fn evaluate_poly_coset_batch_ext3_into_inner( + coefs: &[&[u64]], + n: usize, + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], + keep_device_buf: bool, +) -> Result> { + if coefs.is_empty() { + assert_eq!(outputs.len(), 0); + return Ok(None); + } + let m = coefs.len(); + assert_eq!(outputs.len(), m); + assert!(n.is_power_of_two()); + assert_eq!(weights.len(), n); + assert!(blowup_factor.is_power_of_two()); + for c in coefs.iter() { + assert_eq!(c.len(), 3 * n); + } + let lde_size = n * blowup_factor; + for o in outputs.iter() { + assert_eq!(o.len(), 3 * lde_size); + } + if n == 0 { + return Ok(None); + } + let log_lde = lde_size.trailing_zeros() as u64; + + let mb = 3 * m; + let be = backend(); + let stream = be.next_stream(); + let staging_slot = be.pinned_staging(); + + let mut staging = staging_slot.lock().unwrap(); + staging.ensure_capacity(mb * lde_size, &be.ctx)?; + let pinned = unsafe { staging.as_mut_slice(mb * lde_size) }; + + use rayon::prelude::*; + let pinned_ptr_u = pinned.as_mut_ptr() as usize; + coefs.par_iter().enumerate().for_each(|(c, col)| { + let slab_a = unsafe { + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3) * n), n) + }; + let slab_b = unsafe { + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 1) * n), n) + }; + let slab_c = unsafe { + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 2) * n), n) + }; + for i in 0..n { + slab_a[i] = col[i * 3]; + slab_b[i] = col[i * 3 + 1]; + slab_c[i] = col[i * 3 + 2]; + } + }); + + let mut buf = stream.alloc_zeros::(mb * lde_size)?; + for s in 0..mb { + let mut dst = buf.slice_mut(s * lde_size..s * lde_size + n); + stream.memcpy_htod(&pinned[s * n..s * n + n], &mut dst)?; + } + + let fwd_tw = be.fwd_twiddles_for(log_lde)?; + let weights_dev = stream.clone_htod(weights)?; + + let n_u64 = n as u64; + let lde_u64 = lde_size as u64; + let col_stride_u64 = lde_size as u64; + let mb_u32 = mb as u32; + + // Apply coset scaling: x[k] *= weights[k] for k in 0..n (no iFFT first). + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, mb_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.pointwise_mul_batched) + .arg(&mut buf) + .arg(&weights_dev) + .arg(&n_u64) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + + // Bit-reverse full lde_size slab, then forward DIT NTT. + { + let grid_x = (lde_size as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, mb_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&lde_u64) + .arg(&log_lde) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + fwd_tw.as_ref(), + lde_u64, + log_lde, + col_stride_u64, + mb_u32, + )?; + + stream.memcpy_dtoh(&buf, &mut pinned[..mb * lde_size])?; + stream.synchronize()?; + + let pinned_const = pinned.as_ptr() as usize; + outputs.par_iter_mut().enumerate().for_each(|(c, dst)| { + let slab_a = unsafe { + std::slice::from_raw_parts( + (pinned_const as *const u64).add((c * 3) * lde_size), + lde_size, + ) + }; + let slab_b = unsafe { + std::slice::from_raw_parts( + (pinned_const as *const u64).add((c * 3 + 1) * lde_size), + lde_size, + ) + }; + let slab_c = unsafe { + std::slice::from_raw_parts( + (pinned_const as *const u64).add((c * 3 + 2) * lde_size), + lde_size, + ) + }; + for i in 0..lde_size { + dst[i * 3] = slab_a[i]; + dst[i * 3 + 1] = slab_b[i]; + dst[i * 3 + 2] = slab_c[i]; + } + }); + drop(staging); + if keep_device_buf { + Ok(Some(GpuLdeExt3 { + buf: std::sync::Arc::new(buf), + m, + lde_size, + })) + } else { + drop(buf); + Ok(None) + } +} + +/// Batched coset LDE for Goldilocks **cubic extension** columns. +/// +/// A degree-3 extension element is `(a, b, c)` in memory (three contiguous +/// u64s). The NTT butterfly multiplies `v = (a, b, c)` by a base-field +/// twiddle `t`: `t * v = (t*a, t*b, t*c)`. Addition is componentwise. So an +/// NTT over M ext3 columns is algebraically equivalent to **3M parallel +/// base-field NTTs** sharing the same twiddles and coset weights. We +/// exploit this to reuse the base-field kernels with no modification: +/// +/// 1. Host pack de-interleaves each ext3 column into 3 consecutive +/// base-field slabs inside the pinned staging buffer (slab 0 has all the +/// a-components, slab 1 all the b's, slab 2 all the c's — 3M base slabs +/// in total). +/// 2. Existing `bit_reverse_permute_batched` / `ntt_dit_*_batched` / +/// `pointwise_mul_batched` run over those 3M base slabs on device. +/// 3. D2H, then re-interleave 3 slabs per output ext3 column. +/// +/// Input/output layout: each slice is 3*n or 3*n*blowup u64s, packed as +/// `[a0, b0, c0, a1, b1, c1, ...]` — the natural `[FieldElement]` +/// memory representation. +pub fn coset_lde_batch_ext3_into( + columns: &[&[u64]], + n: usize, + blowup_factor: usize, + weights: &[u64], + outputs: &mut [&mut [u64]], +) -> Result<()> { + if columns.is_empty() { + return Ok(()); + } + let m = columns.len(); + assert_eq!(outputs.len(), m, "outputs must match columns count"); + assert!(n.is_power_of_two(), "n must be a power of two"); + assert_eq!(weights.len(), n, "weights length must match n"); + assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + for c in columns.iter() { + assert_eq!(c.len(), 3 * n, "each ext3 column must be 3*n u64s"); + } + let lde_size = n * blowup_factor; + for o in outputs.iter() { + assert_eq!(o.len(), 3 * lde_size, "each output must be 3*lde_size u64s"); + } + if n == 0 { + return Ok(()); + } + let log_n = n.trailing_zeros() as u64; + let log_lde = lde_size.trailing_zeros() as u64; + + // 3 base slabs per ext3 column; slab index `c*3 + k` holds component `k`. + let mb = 3 * m; + + let be = backend(); + let stream = be.next_stream(); + let staging_slot = be.pinned_staging(); + + let mut staging = staging_slot.lock().unwrap(); + staging.ensure_capacity(mb * lde_size, &be.ctx)?; + let pinned = unsafe { staging.as_mut_slice(mb * lde_size) }; + + // Pack: for each ext3 column, write 3 base slabs into pinned. The slab + // for column c, component k lives at `pinned[(c*3 + k)*n .. (c*3+k)*n + n]`. + // We de-interleave from the interleaved `[a, b, c, a, b, c, ...]` input. + use rayon::prelude::*; + let pinned_ptr_u = pinned.as_mut_ptr() as usize; + columns.par_iter().enumerate().for_each(|(c, col)| { + // SAFETY: disjoint regions per c; staging lock held. + let slab_a = unsafe { + std::slice::from_raw_parts_mut( + (pinned_ptr_u as *mut u64).add((c * 3 + 0) * n), + n, + ) + }; + let slab_b = unsafe { + std::slice::from_raw_parts_mut( + (pinned_ptr_u as *mut u64).add((c * 3 + 1) * n), + n, + ) + }; + let slab_c = unsafe { + std::slice::from_raw_parts_mut( + (pinned_ptr_u as *mut u64).add((c * 3 + 2) * n), + n, + ) + }; + for i in 0..n { + slab_a[i] = col[i * 3 + 0]; + slab_b[i] = col[i * 3 + 1]; + slab_c[i] = col[i * 3 + 2]; + } + }); + + // Allocate + zero-pad device buffer holding 3M slabs of `lde_size`. + let mut buf = stream.alloc_zeros::(mb * lde_size)?; + // H2D: slab by slab into the first N slots of each `lde_size`-slab. + for s in 0..mb { + let mut dst = buf.slice_mut(s * lde_size..s * lde_size + n); + stream.memcpy_htod(&pinned[s * n..s * n + n], &mut dst)?; + } + + let inv_tw = be.inv_twiddles_for(log_n)?; + let fwd_tw = be.fwd_twiddles_for(log_lde)?; + let weights_dev = stream.clone_htod(weights)?; + + let n_u64 = n as u64; + let lde_u64 = lde_size as u64; + let col_stride_u64 = lde_size as u64; + let mb_u32 = mb as u32; + + // === Butterflies: identical to the base-field batched path, but with + // grid.y = 3M instead of M. === + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, mb_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&n_u64) + .arg(&log_n) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + inv_tw.as_ref(), + n_u64, + log_n, + col_stride_u64, + mb_u32, + )?; + { + let grid_x = (n as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, mb_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.pointwise_mul_batched) + .arg(&mut buf) + .arg(&weights_dev) + .arg(&n_u64) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + { + let grid_x = (lde_size as u32).div_ceil(256); + let cfg = LaunchConfig { + grid_dim: (grid_x, mb_u32, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.bit_reverse_permute_batched) + .arg(&mut buf) + .arg(&lde_u64) + .arg(&log_lde) + .arg(&col_stride_u64) + .launch(cfg)?; + } + } + run_batched_ntt_body( + stream.as_ref(), + &mut buf, + fwd_tw.as_ref(), + lde_u64, + log_lde, + col_stride_u64, + mb_u32, + )?; + + stream.memcpy_dtoh(&buf, &mut pinned[..mb * lde_size])?; + stream.synchronize()?; + + // Unpack: for each output column, re-interleave 3 slabs back into the + // ext3-per-element layout. + let pinned_const = pinned.as_ptr() as usize; + outputs.par_iter_mut().enumerate().for_each(|(c, dst)| { + let slab_a = unsafe { + std::slice::from_raw_parts( + (pinned_const as *const u64).add((c * 3 + 0) * lde_size), + lde_size, + ) + }; + let slab_b = unsafe { + std::slice::from_raw_parts( + (pinned_const as *const u64).add((c * 3 + 1) * lde_size), + lde_size, + ) + }; + let slab_c = unsafe { + std::slice::from_raw_parts( + (pinned_const as *const u64).add((c * 3 + 2) * lde_size), + lde_size, + ) + }; + for i in 0..lde_size { + dst[i * 3 + 0] = slab_a[i]; + dst[i * 3 + 1] = slab_b[i]; + dst[i * 3 + 2] = slab_c[i]; + } + }); + drop(staging); + Ok(()) +} + +/// Run the DIT butterfly body of a bit-reversed-input NTT over `m` batched +/// columns in one device buffer. Same fusion strategy as `run_ntt_body`: +/// first 8 levels shmem-fused (coalesced), subsequent levels one kernel each. +fn run_batched_ntt_body( + stream: &cudarc::driver::CudaStream, + x_dev: &mut cudarc::driver::CudaSlice, + tw_dev: &cudarc::driver::CudaSlice, + n: u64, + log_n: u64, + col_stride: u64, + m: u32, +) -> Result<()> { + let be = backend(); + let fused = core::cmp::min(log_n, 8); + if fused >= 8 { + let grid_x = (n / 256) as u32; + let cfg = LaunchConfig { + grid_dim: (grid_x, m, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + let base_step = 0u64; + unsafe { + stream + .launch_builder(&be.ntt_dit_8_levels_batched) + .arg(&mut *x_dev) + .arg(tw_dev) + .arg(&n) + .arg(&log_n) + .arg(&base_step) + .arg(&col_stride) + .launch(cfg)?; + } + } else { + let grid_x = ((n / 2) as u32).div_ceil(256).max(1); + let cfg = LaunchConfig { + grid_dim: (grid_x, m, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + for level in 0..fused { + unsafe { + stream + .launch_builder(&be.ntt_dit_level_batched) + .arg(&mut *x_dev) + .arg(tw_dev) + .arg(&n) + .arg(&log_n) + .arg(&level) + .arg(&col_stride) + .launch(cfg)?; + } + } + } + + let grid_x = ((n / 2) as u32).div_ceil(256).max(1); + let cfg = LaunchConfig { + grid_dim: (grid_x, m, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + for level in fused..log_n { + unsafe { + stream + .launch_builder(&be.ntt_dit_level_batched) + .arg(&mut *x_dev) + .arg(tw_dev) + .arg(&n) + .arg(&log_n) + .arg(&level) + .arg(&col_stride) + .launch(cfg)?; + } + } + Ok(()) +} + diff --git a/crypto/math-cuda/src/lib.rs b/crypto/math-cuda/src/lib.rs new file mode 100644 index 000000000..821f5bd3a --- /dev/null +++ b/crypto/math-cuda/src/lib.rs @@ -0,0 +1,152 @@ +//! GPU backend for the lambda-vm STARK prover. +//! +//! Primary entry point: [`lde::coset_lde_base`]. Everything else (`ntt`, +//! element-wise arith) is either internal to the LDE pipeline or used by the +//! parity test suite. + +pub mod device; +pub mod lde; +pub mod ntt; + +use cudarc::driver::{LaunchConfig, PushKernelArg}; + +use crate::device::{Backend, backend}; + +pub type Result = std::result::Result; + +/// Toolchain sanity: plain wrapping u64 vector add. Not a field op. +pub fn vector_add_u64(a: &[u64], b: &[u64]) -> Result> { + launch_binary_u64(a, b, |be| &be.vector_add_u64) +} + +/// Goldilocks field add on device, element-wise. Inputs may be non-canonical. +pub fn gl_add_u64(a: &[u64], b: &[u64]) -> Result> { + launch_binary_u64(a, b, |be| &be.gl_add) +} + +pub fn gl_sub_u64(a: &[u64], b: &[u64]) -> Result> { + launch_binary_u64(a, b, |be| &be.gl_sub) +} + +pub fn gl_mul_u64(a: &[u64], b: &[u64]) -> Result> { + launch_binary_u64(a, b, |be| &be.gl_mul) +} + +pub fn gl_neg_u64(a: &[u64]) -> Result> { + let n = a.len(); + if n == 0 { + return Ok(Vec::new()); + } + let be = backend(); + let stream = be.next_stream(); + + let a_dev = stream.clone_htod(a)?; + let mut c_dev = stream.alloc_zeros::(n)?; + + let cfg = LaunchConfig::for_num_elems(n as u32); + let n_u64 = n as u64; + unsafe { + stream + .launch_builder(&be.gl_neg) + .arg(&a_dev) + .arg(&mut c_dev) + .arg(&n_u64) + .launch(cfg)?; + } + + let out = stream.clone_dtoh(&c_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Element-wise ext3 multiply. `a` and `b` are 3n u64s (interleaved +/// [a0,a1,a2,b0,b1,b2,...]). Test helper for the `ext3.cuh` header. +pub fn ext3_mul_u64(a: &[u64], b: &[u64]) -> Result> { + assert_eq!(a.len(), b.len()); + assert_eq!(a.len() % 3, 0); + let n = a.len() / 3; + if n == 0 { + return Ok(Vec::new()); + } + let be = backend(); + let stream = be.next_stream(); + let a_dev = stream.clone_htod(a)?; + let b_dev = stream.clone_htod(b)?; + let mut c_dev = stream.alloc_zeros::(3 * n)?; + let cfg = LaunchConfig::for_num_elems(n as u32); + let n_u64 = n as u64; + unsafe { + stream + .launch_builder(&be.ext3_mul) + .arg(&a_dev) + .arg(&b_dev) + .arg(&mut c_dev) + .arg(&n_u64) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&c_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Element-wise ext3 add. +pub fn ext3_add_u64(a: &[u64], b: &[u64]) -> Result> { + assert_eq!(a.len(), b.len()); + assert_eq!(a.len() % 3, 0); + let n = a.len() / 3; + if n == 0 { + return Ok(Vec::new()); + } + let be = backend(); + let stream = be.next_stream(); + let a_dev = stream.clone_htod(a)?; + let b_dev = stream.clone_htod(b)?; + let mut c_dev = stream.alloc_zeros::(3 * n)?; + let cfg = LaunchConfig::for_num_elems(n as u32); + let n_u64 = n as u64; + unsafe { + stream + .launch_builder(&be.ext3_add) + .arg(&a_dev) + .arg(&b_dev) + .arg(&mut c_dev) + .arg(&n_u64) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&c_dev)?; + stream.synchronize()?; + Ok(out) +} + +fn launch_binary_u64(a: &[u64], b: &[u64], pick: F) -> Result> +where + F: for<'a> Fn(&'a Backend) -> &'a cudarc::driver::CudaFunction, +{ + assert_eq!(a.len(), b.len(), "length mismatch"); + let n = a.len(); + if n == 0 { + return Ok(Vec::new()); + } + let be = backend(); + let stream = be.next_stream(); + + let a_dev = stream.clone_htod(a)?; + let b_dev = stream.clone_htod(b)?; + let mut c_dev = stream.alloc_zeros::(n)?; + + let cfg = LaunchConfig::for_num_elems(n as u32); + let n_u64 = n as u64; + unsafe { + stream + .launch_builder(pick(be)) + .arg(&a_dev) + .arg(&b_dev) + .arg(&mut c_dev) + .arg(&n_u64) + .launch(cfg)?; + } + + let out = stream.clone_dtoh(&c_dev)?; + stream.synchronize()?; + Ok(out) +} diff --git a/crypto/math-cuda/src/ntt.rs b/crypto/math-cuda/src/ntt.rs new file mode 100644 index 000000000..0ebb015ea --- /dev/null +++ b/crypto/math-cuda/src/ntt.rs @@ -0,0 +1,211 @@ +//! Forward and inverse NTT over Goldilocks base field. Matches the algebraic +//! contract of `math::polynomial::Polynomial::evaluate_fft` / +//! `interpolate_fft`: +//! input = n elements in natural order +//! output = n elements in natural order. +//! +//! Parity is checked by `tests/ntt.rs` against the CPU implementation. + +use cudarc::driver::{LaunchConfig, PushKernelArg}; +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsFFTField, IsField}; + +use crate::Result; +use crate::device::backend; + +/// Host-side twiddle table: `[ω^0, ω^1, …, ω^{n/2-1}]` where ω is the +/// primitive n-th root of unity. Exposed for `device::Backend::cached_twiddles` +/// and for direct use in tests / benches. +pub fn twiddles_forward(log_n: u64) -> Vec { + let omega = *GoldilocksField::get_primitive_root_of_unity(log_n) + .expect("primitive root") + .value(); + powers_of(omega, 1usize << (log_n - 1)) +} + +/// Inverse twiddle table: `[ω^{-i}]` for i in [0, n/2). +pub fn twiddles_inverse(log_n: u64) -> Vec { + let omega = GoldilocksField::get_primitive_root_of_unity(log_n).expect("primitive root"); + let omega_inv = FieldElement::::inv(&omega).expect("inverse"); + powers_of(*omega_inv.value(), 1usize << (log_n - 1)) +} + +fn powers_of(base: u64, count: usize) -> Vec { + let mut out = Vec::with_capacity(count); + let mut w = 1u64; + for _ in 0..count { + out.push(w); + w = GoldilocksField::mul(&w, &base); + } + out +} + +/// Forward NTT on a slice of `n = 2^log_n` Goldilocks coefficients. Takes +/// natural-order input and returns natural-order evaluations. +pub fn forward(coeffs: &[u64]) -> Result> { + ntt_inplace(coeffs, /*forward=*/ true) +} + +/// Inverse NTT on a slice of `n = 2^log_n` Goldilocks evaluations. Takes +/// natural-order evaluations and returns natural-order coefficients. Includes +/// the 1/n scaling. +pub fn inverse(evals: &[u64]) -> Result> { + ntt_inplace(evals, /*forward=*/ false) +} + +fn ntt_inplace(input: &[u64], forward: bool) -> Result> { + let n = input.len(); + assert!(n.is_power_of_two(), "ntt length must be a power of two"); + if n <= 1 { + return Ok(input.to_vec()); + } + let log_n = n.trailing_zeros() as u64; + + let be = backend(); + let stream = be.next_stream(); + + let mut x_dev = stream.clone_htod(input)?; + let tw_dev = if forward { + be.fwd_twiddles_for(log_n)? + } else { + be.inv_twiddles_for(log_n)? + }; + + let n_u64 = n as u64; + + // 1. Bit-reverse: natural → bit-reversed. + unsafe { + stream + .launch_builder(&be.bit_reverse_permute) + .arg(&mut x_dev) + .arg(&n_u64) + .arg(&log_n) + .launch(LaunchConfig::for_num_elems(n as u32))?; + } + + // 2. DIT butterfly levels. For log_n >= 8 we fuse 8 levels per kernel via + // the shmem kernel; for very small sizes (< 256 elements) we stick with + // the per-level kernel because the shmem block dimensions assume n ≥ 256. + run_ntt_body( + stream.as_ref(), + &mut x_dev, + tw_dev.as_ref(), + n_u64, + log_n, + )?; + + // 3. For iNTT, multiply by 1/n. + if !forward { + let n_fe = FieldElement::::from(n as u64); + let inv_n = *n_fe.inv().expect("n is non-zero").value(); + unsafe { + stream + .launch_builder(&be.scalar_mul) + .arg(&mut x_dev) + .arg(&inv_n) + .arg(&n_u64) + .launch(LaunchConfig::for_num_elems(n as u32))?; + } + } + + let out = stream.clone_dtoh(&x_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Run the butterfly body of a bit-reversed-input DIT NTT. Split out so the +/// LDE orchestrator can reuse it on the same device buffer. +pub(crate) fn run_ntt_body( + stream: &cudarc::driver::CudaStream, + x_dev: &mut cudarc::driver::CudaSlice, + tw_dev: &cudarc::driver::CudaSlice, + n: u64, + log_n: u64, +) -> Result<()> { + let be = backend(); + // Levels 0..min(log_n, 8): one shmem-fused launch. Loads are fully + // coalesced (base_step=0 → `row = tid`) and 8 butterfly rounds stay on + // chip. This is the big DRAM-bandwidth win. + let fused = core::cmp::min(log_n, 8); + if fused >= 8 { + let grid_x = (n / 256) as u32; + let cfg = LaunchConfig { + grid_dim: (grid_x, 1, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + let base_step = 0u64; + unsafe { + stream + .launch_builder(&be.ntt_dit_8_levels) + .arg(&mut *x_dev) + .arg(tw_dev) + .arg(&n) + .arg(&log_n) + .arg(&base_step) + .launch(cfg)?; + } + } else { + // Sub-256-element NTT. Use per-level. + let half_cfg = LaunchConfig::for_num_elems((n / 2) as u32); + for level in 0..fused { + unsafe { + stream + .launch_builder(&be.ntt_dit_level) + .arg(&mut *x_dev) + .arg(tw_dev) + .arg(&n) + .arg(&log_n) + .arg(&level) + .launch(half_cfg)?; + } + } + } + + // Levels 8..log_n: per-level kernels. Loads are fully coalesced in the + // per-level path; switching to fused-with-row-remap at base_step>0 tanks + // DRAM throughput enough to wipe out the launch savings. + let half_cfg = LaunchConfig::for_num_elems((n / 2) as u32); + for level in fused..log_n { + unsafe { + stream + .launch_builder(&be.ntt_dit_level) + .arg(&mut *x_dev) + .arg(tw_dev) + .arg(&n) + .arg(&log_n) + .arg(&level) + .launch(half_cfg)?; + } + } + Ok(()) +} + +/// Pointwise multiply: `x[i] *= w[i]`. +pub fn pointwise_mul(x: &[u64], w: &[u64]) -> Result> { + assert_eq!(x.len(), w.len()); + let n = x.len(); + if n == 0 { + return Ok(Vec::new()); + } + let be = backend(); + let stream = be.next_stream(); + + let mut x_dev = stream.clone_htod(x)?; + let w_dev = stream.clone_htod(w)?; + + let n_u64 = n as u64; + unsafe { + stream + .launch_builder(&be.pointwise_mul) + .arg(&mut x_dev) + .arg(&w_dev) + .arg(&n_u64) + .launch(LaunchConfig::for_num_elems(n as u32))?; + } + + let out = stream.clone_dtoh(&x_dev)?; + stream.synchronize()?; + Ok(out) +} diff --git a/crypto/math-cuda/tests/bench_quick.rs b/crypto/math-cuda/tests/bench_quick.rs new file mode 100644 index 000000000..561331b74 --- /dev/null +++ b/crypto/math-cuda/tests/bench_quick.rs @@ -0,0 +1,356 @@ +//! Informal timing comparison for single-column and multi-column LDE. +//! Ignored by default; run with `cargo test ... -- --ignored --nocapture`. + +use std::time::Instant; + +use math::fft::cpu::bowers_fft::LayerTwiddles; +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::IsField; +use math::polynomial::Polynomial; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use rayon::prelude::*; + +type Fp = FieldElement; + +fn coset_weights(n: usize, g: u64) -> Vec { + let inv_n = *FieldElement::::from(n as u64).inv().unwrap().value(); + let mut w = Vec::with_capacity(n); + let mut cur = inv_n; + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &g); + } + w +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_2_to_18_blowup_4() { + let log_n = 18; + let blowup = 4; + let n = 1usize << log_n; + let lde = n * blowup; + let mut rng = ChaCha8Rng::seed_from_u64(1); + let input: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + let weights = coset_weights(n, 7); + + let _ = math_cuda::lde::coset_lde_base(&input, blowup, &weights).unwrap(); + + let inv_tw = LayerTwiddles::::new_inverse(log_n as u64).unwrap(); + let fwd_tw = LayerTwiddles::::new(lde.trailing_zeros() as u64).unwrap(); + let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); + + const TRIALS: u32 = 10; + + let t0 = Instant::now(); + for _ in 0..TRIALS { + let _ = math_cuda::lde::coset_lde_base(&input, blowup, &weights).unwrap(); + } + let gpu_ns = t0.elapsed().as_nanos() / TRIALS as u128; + + let t0 = Instant::now(); + for _ in 0..TRIALS { + let mut buf: Vec = input.iter().map(|&x| Fp::from_raw(x)).collect(); + Polynomial::coset_lde_full_expand::( + &mut buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + ) + .unwrap(); + std::hint::black_box(&buf); + } + let cpu_ns = t0.elapsed().as_nanos() / TRIALS as u128; + + let ratio = cpu_ns as f64 / gpu_ns as f64; + println!( + "single-column LDE 2^{log_n} blowup={blowup}: cpu={cpu_ns}ns gpu={gpu_ns}ns ratio={ratio:.2}x", + ); +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_2_to_16_blowup_4() { + let log_n = 16; + let blowup = 4; + let n = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(2); + let input: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + let weights = coset_weights(n, 7); + + let _ = math_cuda::lde::coset_lde_base(&input, blowup, &weights).unwrap(); + + const TRIALS: u32 = 20; + + let t0 = Instant::now(); + for _ in 0..TRIALS { + let _ = math_cuda::lde::coset_lde_base(&input, blowup, &weights).unwrap(); + } + let gpu_ns = t0.elapsed().as_nanos() / TRIALS as u128; + println!("single-column LDE 2^{log_n} blowup={blowup}: gpu={gpu_ns}ns"); +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_multi_column_parallel() { + // Simulates the prover's Phase A: many columns processed via rayon. + // log_n = 16 keeps memory footprint manageable while still stressing streams. + let log_n = 16u32; + let blowup = 4usize; + let n = 1usize << log_n; + let lde = n * blowup; + let num_cols = 64; + + // Warm up. + let _ = math_cuda::lde::coset_lde_base( + &vec![0u64; n], + blowup, + &coset_weights(n, 7), + ) + .unwrap(); + + // Build input data. + let mut rng = ChaCha8Rng::seed_from_u64(11); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + let weights = coset_weights(n, 7); + let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); + let inv_tw = LayerTwiddles::::new_inverse(log_n as u64).unwrap(); + let fwd_tw = LayerTwiddles::::new(lde.trailing_zeros() as u64).unwrap(); + + // GPU: rayon parallel across columns, each column picks a stream. + let t0 = Instant::now(); + let _gpu_results: Vec> = columns + .par_iter() + .map(|col| math_cuda::lde::coset_lde_base(col, blowup, &weights).unwrap()) + .collect(); + let gpu_ns = t0.elapsed().as_nanos(); + + // CPU: same rayon parallel pattern as the prover's `expand_columns_to_lde`. + let mut cpu_bufs: Vec> = columns + .iter() + .map(|c| c.iter().map(|&x| Fp::from_raw(x)).collect()) + .collect(); + let t0 = Instant::now(); + cpu_bufs.par_iter_mut().for_each(|buf| { + Polynomial::coset_lde_full_expand::( + buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + ) + .unwrap(); + }); + let cpu_ns = t0.elapsed().as_nanos(); + + let ratio = cpu_ns as f64 / gpu_ns as f64; + println!( + "{num_cols}-column LDE 2^{log_n} blowup={blowup}: cpu={cpu_ns}ns gpu={gpu_ns}ns ratio={ratio:.2}x (cores={})", + rayon::current_num_threads(), + ); +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_batched_prover_scale() { + // Realistic large-table shape from the 1M-fib prover: ~1M rows, blowup 4, + // a few dozen columns. This is what actually runs in expand_columns_to_lde. + let log_n = 20u32; // 1M rows + let blowup = 4usize; + let n = 1usize << log_n; + let num_cols = 20; + + let mut rng = ChaCha8Rng::seed_from_u64(31); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + let weights = coset_weights(n, 7); + let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); + let inv_tw = LayerTwiddles::::new_inverse(log_n as u64).unwrap(); + let fwd_tw = LayerTwiddles::::new( + (n * blowup).trailing_zeros() as u64, + ) + .unwrap(); + + let warm_slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); + for _ in 0..8 { + let _ = + math_cuda::lde::coset_lde_batch_base(&warm_slices, blowup, &weights).unwrap(); + } + + let slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); + let mut gpu_samples = Vec::with_capacity(10); + for _ in 0..10 { + let t0 = Instant::now(); + let _ = math_cuda::lde::coset_lde_batch_base(&slices, blowup, &weights).unwrap(); + gpu_samples.push(t0.elapsed().as_nanos()); + } + gpu_samples.sort(); + let gpu_ns = gpu_samples[gpu_samples.len() / 2]; // median + + let mut cpu_samples = Vec::with_capacity(10); + for _ in 0..10 { + let mut cpu_bufs: Vec> = columns + .iter() + .map(|c| c.iter().map(|&x| Fp::from_raw(x)).collect()) + .collect(); + let t0 = Instant::now(); + cpu_bufs.par_iter_mut().for_each(|buf| { + Polynomial::coset_lde_full_expand::( + buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + ) + .unwrap(); + }); + cpu_samples.push(t0.elapsed().as_nanos()); + } + cpu_samples.sort(); + let cpu_ns = cpu_samples[cpu_samples.len() / 2]; // median + + let ratio = cpu_ns as f64 / gpu_ns as f64; + println!( + "prover-scale batched {num_cols} cols, log_n={log_n}, blowup={blowup}: cpu={cpu_ns}ns gpu={gpu_ns}ns ratio={ratio:.2}x (median of 10)", + ); +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_batched_vs_rayon_cpu() { + let log_n = 16u32; + let blowup = 4usize; + let n = 1usize << log_n; + let num_cols = 64; + + let mut rng = ChaCha8Rng::seed_from_u64(21); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + let weights = coset_weights(n, 7); + + // Warm up every stream slot so subsequent iterations don't pay the + // one-time pinned staging alloc cost. + let warm_slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); + for _ in 0..64 { + let _ = + math_cuda::lde::coset_lde_batch_base(&warm_slices, blowup, &weights).unwrap(); + } + let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); + let inv_tw = LayerTwiddles::::new_inverse(log_n as u64).unwrap(); + let fwd_tw = LayerTwiddles::::new( + (n * blowup).trailing_zeros() as u64, + ) + .unwrap(); + + // GPU batched — first run may include lazy device init; do a few to stabilise. + let slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); + let mut gpu_ns = u128::MAX; + for _ in 0..5 { + let t0 = Instant::now(); + let _ = math_cuda::lde::coset_lde_batch_base(&slices, blowup, &weights).unwrap(); + gpu_ns = gpu_ns.min(t0.elapsed().as_nanos()); + } + + // CPU rayon (same pattern as prover). + let mut cpu_bufs: Vec> = columns + .iter() + .map(|c| c.iter().map(|&x| Fp::from_raw(x)).collect()) + .collect(); + let t0 = Instant::now(); + cpu_bufs.par_iter_mut().for_each(|buf| { + Polynomial::coset_lde_full_expand::( + buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + ) + .unwrap(); + }); + let cpu_ns = t0.elapsed().as_nanos(); + + let ratio = cpu_ns as f64 / gpu_ns as f64; + println!( + "batched {num_cols} cols, log_n={log_n}, blowup={blowup}: cpu={cpu_ns}ns gpu={gpu_ns}ns ratio={ratio:.2}x (cores={})", + rayon::current_num_threads(), + ); +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_multi_column_serialized_gpu() { + use std::sync::Mutex; + + let log_n = 16u32; + let blowup = 4usize; + let n = 1usize << log_n; + let num_cols = 64; + + let _ = math_cuda::lde::coset_lde_base( + &vec![0u64; n], + blowup, + &coset_weights(n, 7), + ) + .unwrap(); + + let mut rng = ChaCha8Rng::seed_from_u64(13); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + let weights = coset_weights(n, 7); + + // Single global Mutex so only one thread at a time calls GPU. + let gpu_lock = Mutex::new(()); + let t0 = Instant::now(); + let _: Vec> = columns + .par_iter() + .map(|col| { + let _guard = gpu_lock.lock().unwrap(); + math_cuda::lde::coset_lde_base(col, blowup, &weights).unwrap() + }) + .collect(); + let gpu_ns = t0.elapsed().as_nanos(); + println!("GPU mutex-serialised from rayon: {gpu_ns}ns for {num_cols} cols"); +} + +#[test] +#[ignore = "informal perf probe; run with --ignored"] +fn bench_lde_multi_column_gpu_limited_threads() { + // Same as multi_column_parallel but forces rayon to use only 8 threads + // (matching the GPU stream pool rough capacity). Tests whether oversubscribed + // rayon + many streams is the bottleneck. + let gpu_pool = rayon::ThreadPoolBuilder::new() + .num_threads(8) + .build() + .unwrap(); + + let log_n = 16u32; + let blowup = 4usize; + let n = 1usize << log_n; + let num_cols = 64; + + let _ = math_cuda::lde::coset_lde_base( + &vec![0u64; n], + blowup, + &coset_weights(n, 7), + ) + .unwrap(); + + let mut rng = ChaCha8Rng::seed_from_u64(12); + let columns: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + let weights = coset_weights(n, 7); + + let t0 = Instant::now(); + let _gpu_results: Vec> = gpu_pool.install(|| { + columns + .par_iter() + .map(|col| math_cuda::lde::coset_lde_base(col, blowup, &weights).unwrap()) + .collect() + }); + let gpu_ns = t0.elapsed().as_nanos(); + + let t0 = Instant::now(); + let _serial_gpu_results: Vec> = columns + .iter() + .map(|col| math_cuda::lde::coset_lde_base(col, blowup, &weights).unwrap()) + .collect(); + let gpu_serial_ns = t0.elapsed().as_nanos(); + + println!( + "GPU-only 8-thread: gpu-parallel={gpu_ns}ns gpu-serial={gpu_serial_ns}ns speedup={:.2}x", + gpu_serial_ns as f64 / gpu_ns as f64, + ); +} diff --git a/crypto/math-cuda/tests/evaluate_coset_ext3.rs b/crypto/math-cuda/tests/evaluate_coset_ext3.rs new file mode 100644 index 000000000..a79195291 --- /dev/null +++ b/crypto/math-cuda/tests/evaluate_coset_ext3.rs @@ -0,0 +1,143 @@ +//! Parity test for `evaluate_poly_coset_batch_ext3_into`. +//! +//! Reference: `math::polynomial::Polynomial::evaluate_offset_fft` on an ext3 +//! polynomial, then canonicalise. The GPU path should produce the same +//! evaluations on the offset-coset at `n * blowup` points. + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use math::polynomial::Polynomial; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +fn offset_weights(n: usize, offset: u64) -> Vec { + let mut w = Vec::with_capacity(n); + let mut cur = 1u64; + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &offset); + } + w +} + +fn rand_ext3(rng: &mut ChaCha8Rng) -> Fp3 { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) +} + +fn ext3_to_u64s(col: &[Fp3]) -> Vec { + let mut out = Vec::with_capacity(col.len() * 3); + for e in col { + out.push(*e.value()[0].value()); + out.push(*e.value()[1].value()); + out.push(*e.value()[2].value()); + } + out +} + +fn u64s_to_ext3(raw: &[u64]) -> Vec { + let mut out = Vec::with_capacity(raw.len() / 3); + for i in 0..raw.len() / 3 { + out.push(Fp3::new([ + Fp::from_raw(raw[i * 3 + 0]), + Fp::from_raw(raw[i * 3 + 1]), + Fp::from_raw(raw[i * 3 + 2]), + ])); + } + out +} + +fn canon_fp3(e: &Fp3) -> [u64; 3] { + [ + GoldilocksField::canonical(e.value()[0].value()), + GoldilocksField::canonical(e.value()[1].value()), + GoldilocksField::canonical(e.value()[2].value()), + ] +} + +fn assert_evaluate_coset(log_n: u64, blowup: usize, m: usize, offset: u64, seed: u64) { + let n = 1usize << log_n; + let lde_size = n * blowup; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + + // M ext3 polynomials, each of degree < n. + let polys: Vec> = (0..m) + .map(|_| (0..n).map(|_| rand_ext3(&mut rng)).collect()) + .collect(); + + let weights = offset_weights(n, offset); + + // CPU reference: evaluate each polynomial at `offset`-coset of size lde_size. + let offset_fp = Fp::from_raw(offset); + let cpu: Vec> = polys + .iter() + .map(|coefs| { + let p = Polynomial::new(coefs); + Polynomial::evaluate_offset_fft::( + &p, + blowup, + Some(n), + &offset_fp, + ) + .unwrap() + }) + .collect(); + + // GPU: flatten each poly to 3n u64s, pre-allocate 3*lde_size u64 outputs. + let flat_inputs: Vec> = polys.iter().map(|p| ext3_to_u64s(p)).collect(); + let input_slices: Vec<&[u64]> = flat_inputs.iter().map(|v| v.as_slice()).collect(); + let mut flat_outputs: Vec> = (0..m).map(|_| vec![0u64; 3 * lde_size]).collect(); + { + let mut out_slices: Vec<&mut [u64]> = + flat_outputs.iter_mut().map(|v| v.as_mut_slice()).collect(); + math_cuda::lde::evaluate_poly_coset_batch_ext3_into( + &input_slices, + n, + blowup, + &weights, + &mut out_slices, + ) + .unwrap(); + } + + for c in 0..m { + let gpu: Vec = u64s_to_ext3(&flat_outputs[c]); + assert_eq!(gpu.len(), cpu[c].len(), "length mismatch"); + for i in 0..gpu.len() { + let g = canon_fp3(&gpu[i]); + let cc = canon_fp3(&cpu[c][i]); + assert_eq!(g, cc, "eval mismatch col={c} row={i} log_n={log_n} blowup={blowup}"); + } + } +} + +#[test] +fn ext3_evaluate_coset_small() { + for &m in &[1usize, 4] { + for log_n in 4..=10 { + for &blowup in &[2usize, 4] { + assert_evaluate_coset(log_n, blowup, m, 7, 100 + log_n * 10 + m as u64); + } + } + } +} + +#[test] +fn ext3_evaluate_coset_medium() { + for log_n in 11..=14 { + assert_evaluate_coset(log_n, 4, 2, 7, 200 + log_n); + } +} + +#[test] +fn ext3_evaluate_coset_large_one_column() { + assert_evaluate_coset(16, 4, 1, 7, 0xCAFE); +} diff --git a/crypto/math-cuda/tests/ext3.rs b/crypto/math-cuda/tests/ext3.rs new file mode 100644 index 000000000..c9aabbc27 --- /dev/null +++ b/crypto/math-cuda/tests/ext3.rs @@ -0,0 +1,87 @@ +//! Parity: GPU ext3 arithmetic must agree (canonically) with CPU +//! `Degree3GoldilocksExtensionField` on random ext3 inputs. + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +const N: usize = 10_000; + +fn random_fp3s(seed: u64, count: usize) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..count) + .map(|_| { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) + }) + .collect() +} + +fn to_u64s(col: &[Fp3]) -> Vec { + let mut v = Vec::with_capacity(col.len() * 3); + for e in col { + v.push(*e.value()[0].value()); + v.push(*e.value()[1].value()); + v.push(*e.value()[2].value()); + } + v +} + +fn canon_triplet(e: &Fp3) -> [u64; 3] { + [ + GoldilocksField::canonical(e.value()[0].value()), + GoldilocksField::canonical(e.value()[1].value()), + GoldilocksField::canonical(e.value()[2].value()), + ] +} + +fn canon_triplet_raw(t: &[u64]) -> [u64; 3] { + [ + GoldilocksField::canonical(&t[0]), + GoldilocksField::canonical(&t[1]), + GoldilocksField::canonical(&t[2]), + ] +} + +#[test] +fn ext3_mul_matches_cpu() { + let a = random_fp3s(11, N); + let b = random_fp3s(22, N); + let a_raw = to_u64s(&a); + let b_raw = to_u64s(&b); + let gpu = math_cuda::ext3_mul_u64(&a_raw, &b_raw).unwrap(); + assert_eq!(gpu.len(), 3 * N); + for i in 0..N { + use math::field::traits::IsField; + let cpu = Degree3GoldilocksExtensionField::mul(a[i].value(), b[i].value()); + let cpu_fp3 = Fp3::new(cpu); + let g = canon_triplet_raw(&gpu[i * 3..(i + 1) * 3]); + let c = canon_triplet(&cpu_fp3); + assert_eq!(g, c, "ext3 mul mismatch at {i}"); + } +} + +#[test] +fn ext3_add_matches_cpu() { + let a = random_fp3s(33, N); + let b = random_fp3s(44, N); + let a_raw = to_u64s(&a); + let b_raw = to_u64s(&b); + let gpu = math_cuda::ext3_add_u64(&a_raw, &b_raw).unwrap(); + for i in 0..N { + let cpu = Degree3GoldilocksExtensionField::add(a[i].value(), b[i].value()); + let cpu_fp3 = Fp3::new(cpu); + let g = canon_triplet_raw(&gpu[i * 3..(i + 1) * 3]); + let c = canon_triplet(&cpu_fp3); + assert_eq!(g, c, "ext3 add mismatch at {i}"); + } +} diff --git a/crypto/math-cuda/tests/goldilocks.rs b/crypto/math-cuda/tests/goldilocks.rs new file mode 100644 index 000000000..317ffb0f8 --- /dev/null +++ b/crypto/math-cuda/tests/goldilocks.rs @@ -0,0 +1,127 @@ +//! GPU must produce bit-identical u64 outputs to `GoldilocksField` for every op. +//! Non-canonical inputs are expected (CPU operates on the full [0, 2^64) range), +//! so the test inputs include values above the prime. + +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +const N: usize = 10_000; + +fn sample_inputs(seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..N).map(|_| rng.r#gen::()).collect() +} + +fn assert_raw_eq(op: &str, expected: &[u64], actual: &[u64]) { + assert_eq!(expected.len(), actual.len()); + for (i, (e, a)) in expected.iter().zip(actual.iter()).enumerate() { + if e != a { + panic!( + "{op} mismatch at {i}: cpu={e:#018x} (canon {:#018x}), gpu={a:#018x} (canon {:#018x})", + GoldilocksField::canonical(e), + GoldilocksField::canonical(a), + ); + } + } +} + +#[test] +fn gpu_vector_add_u64_matches_wrapping() { + let a = sample_inputs(0xC0FFEE); + let b = sample_inputs(0xDEADBEEF); + let expected: Vec = a.iter().zip(&b).map(|(x, y)| x.wrapping_add(*y)).collect(); + let actual = math_cuda::vector_add_u64(&a, &b).expect("GPU vector_add_u64"); + assert_raw_eq("vector_add (wrapping)", &expected, &actual); +} + +#[test] +fn gpu_gl_add_matches_cpu() { + let a = sample_inputs(1); + let b = sample_inputs(2); + let expected: Vec = a + .iter() + .zip(&b) + .map(|(x, y)| GoldilocksField::add(x, y)) + .collect(); + let actual = math_cuda::gl_add_u64(&a, &b).expect("GPU gl_add"); + assert_raw_eq("gl_add", &expected, &actual); +} + +#[test] +fn gpu_gl_sub_matches_cpu() { + let a = sample_inputs(3); + let b = sample_inputs(4); + let expected: Vec = a + .iter() + .zip(&b) + .map(|(x, y)| GoldilocksField::sub(x, y)) + .collect(); + let actual = math_cuda::gl_sub_u64(&a, &b).expect("GPU gl_sub"); + assert_raw_eq("gl_sub", &expected, &actual); +} + +#[test] +fn gpu_gl_mul_matches_cpu() { + let a = sample_inputs(5); + let b = sample_inputs(6); + let expected: Vec = a + .iter() + .zip(&b) + .map(|(x, y)| GoldilocksField::mul(x, y)) + .collect(); + let actual = math_cuda::gl_mul_u64(&a, &b).expect("GPU gl_mul"); + assert_raw_eq("gl_mul", &expected, &actual); +} + +#[test] +fn gpu_gl_neg_matches_cpu() { + let a = sample_inputs(7); + let expected: Vec = a.iter().map(|x| GoldilocksField::neg(x)).collect(); + let actual = math_cuda::gl_neg_u64(&a).expect("GPU gl_neg"); + assert_raw_eq("gl_neg", &expected, &actual); +} + +/// Edge cases the random generator is unlikely to hit: 0, 1, p-1, p, p+1, 2p-1, +/// u64::MAX, EPSILON boundary values. Covers double-overflow / double-underflow. +#[test] +fn gpu_goldilocks_edge_cases() { + const P: u64 = 0xFFFF_FFFF_0000_0001; + const EPS: u64 = 0xFFFF_FFFF; + let edge: [u64; 11] = [ + 0, + 1, + P - 1, + P, + P + 1, + 2u64.wrapping_mul(P).wrapping_sub(1), + u64::MAX, + u64::MAX - EPS, + u64::MAX - 1, + EPS, + EPS - 1, + ]; + // All pairs via nested loops, materialised as flat a[], b[] of length edge^2. + let mut a = Vec::with_capacity(edge.len() * edge.len()); + let mut b = Vec::with_capacity(edge.len() * edge.len()); + for &x in &edge { + for &y in &edge { + a.push(x); + b.push(y); + } + } + + let cases: &[(&str, fn(&[u64], &[u64]) -> math_cuda::Result>, fn(&u64, &u64) -> u64)] = + &[ + ("gl_add", math_cuda::gl_add_u64, GoldilocksField::add), + ("gl_sub", math_cuda::gl_sub_u64, GoldilocksField::sub), + ("gl_mul", math_cuda::gl_mul_u64, GoldilocksField::mul), + ]; + + for (op, gpu_fn, cpu_fn) in cases { + let expected: Vec = a.iter().zip(&b).map(|(x, y)| cpu_fn(x, y)).collect(); + let actual = gpu_fn(&a, &b).expect("GPU op"); + assert_raw_eq(op, &expected, &actual); + } +} diff --git a/crypto/math-cuda/tests/lde.rs b/crypto/math-cuda/tests/lde.rs new file mode 100644 index 000000000..9648f833a --- /dev/null +++ b/crypto/math-cuda/tests/lde.rs @@ -0,0 +1,112 @@ +//! Phase-5 parity: GPU `coset_lde_base` must match the CPU +//! `Polynomial::coset_lde_full_expand` for a sweep of realistic sizes and +//! blowup factors. + +use math::fft::cpu::bowers_fft::LayerTwiddles; +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use math::polynomial::Polynomial; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; + +/// Build the coset weights `[1/N, g/N, g²/N, …, g^{n-1}/N]` — this is the +/// layout `crypto/stark/src/prover.rs:248` uses, with `1/N` pre-folded into the +/// first coefficient so the iFFT step does not need a separate scaling pass. +fn coset_weights(n: usize, coset_offset: u64) -> Vec { + let inv_n_fe = FieldElement::::from(n as u64) + .inv() + .expect("n is non-zero"); + let mut w = Vec::with_capacity(n); + let mut cur = *inv_n_fe.value(); + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &coset_offset); + } + w +} + +fn cpu_lde(evals: &[u64], blowup_factor: usize, coset_offset: u64) -> Vec { + let n = evals.len(); + let log_n = n.trailing_zeros() as u64; + let log_lde = (n * blowup_factor).trailing_zeros() as u64; + + let inv_tw = LayerTwiddles::::new_inverse(log_n).expect("inv tw"); + let fwd_tw = LayerTwiddles::::new(log_lde).expect("fwd tw"); + let weights_raw = coset_weights(n, coset_offset); + let weights: Vec = weights_raw.iter().map(|&w| Fp::from_raw(w)).collect(); + + let mut buf: Vec = evals.iter().map(|&x| Fp::from_raw(x)).collect(); + Polynomial::coset_lde_full_expand::( + &mut buf, + blowup_factor, + &weights, + &inv_tw, + &fwd_tw, + ) + .expect("cpu lde"); + + buf.into_iter().map(|e| *e.value()).collect() +} + +fn canon(xs: &[u64]) -> Vec { + xs.iter().map(|x| GoldilocksField::canonical(x)).collect() +} + +fn assert_lde_match(log_n: u64, blowup_factor: usize, seed: u64) { + let n = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let evals: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + + // Use a fixed, public coset offset. For lambda-vm the coset offset is the + // generator of Goldilocks' multiplicative subgroup; any non-trivial element + // works for an isolated correctness check. + let coset_offset: u64 = 7; + let weights = coset_weights(n, coset_offset); + + let cpu = cpu_lde(&evals, blowup_factor, coset_offset); + let gpu = math_cuda::lde::coset_lde_base(&evals, blowup_factor, &weights).expect("gpu lde"); + + assert_eq!(cpu.len(), gpu.len(), "length mismatch (log_n={log_n}, blowup={blowup_factor})"); + let cpu_c = canon(&cpu); + let gpu_c = canon(&gpu); + for (i, (e, a)) in cpu_c.iter().zip(&gpu_c).enumerate() { + if e != a { + panic!( + "lde mismatch log_n={log_n} blowup={blowup_factor} i={i}: cpu {e:#018x}, gpu {a:#018x}", + ); + } + } +} + +#[test] +fn lde_small() { + for log_n in 4..=10 { + for &blow in &[2usize, 4, 8] { + assert_lde_match(log_n, blow, 1_000 + log_n + (blow as u64)); + } + } +} + +#[test] +fn lde_medium() { + for log_n in 11..=14 { + for &blow in &[2usize, 4] { + assert_lde_match(log_n, blow, 2_000 + log_n + (blow as u64)); + } + } +} + +#[test] +fn lde_large_2_to_18() { + // 2^18 × blowup 4 = 2^20 LDE — representative of Phase A trace columns. + assert_lde_match(18, 4, 0xCAFE); +} + +#[test] +fn lde_largest_2_to_20() { + // 2^20 LDE is the hot size; blowup 2 keeps total = 2^21 (within TWO_ADICITY). + assert_lde_match(20, 2, 0xF00D); +} diff --git a/crypto/math-cuda/tests/lde_batch.rs b/crypto/math-cuda/tests/lde_batch.rs new file mode 100644 index 000000000..67f975728 --- /dev/null +++ b/crypto/math-cuda/tests/lde_batch.rs @@ -0,0 +1,96 @@ +//! Batched coset LDE must agree with running the CPU single-column LDE on +//! each column independently. Sweeps a few realistic (n, blowup, m) tuples. + +use math::fft::cpu::bowers_fft::LayerTwiddles; +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use math::polynomial::Polynomial; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; + +fn coset_weights(n: usize, g: u64) -> Vec { + let inv_n = *FieldElement::::from(n as u64) + .inv() + .unwrap() + .value(); + let mut w = Vec::with_capacity(n); + let mut cur = inv_n; + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &g); + } + w +} + +fn cpu_lde_one(col: &[u64], blowup: usize, weights_fp: &[Fp], inv_tw: &LayerTwiddles, fwd_tw: &LayerTwiddles) -> Vec { + let mut buf: Vec = col.iter().map(|&x| Fp::from_raw(x)).collect(); + Polynomial::coset_lde_full_expand::( + &mut buf, blowup, weights_fp, inv_tw, fwd_tw, + ) + .unwrap(); + buf.into_iter().map(|e| *e.value()).collect() +} + +fn canon(xs: &[u64]) -> Vec { + xs.iter().map(|x| GoldilocksField::canonical(x)).collect() +} + +fn assert_batch(log_n: u64, blowup: usize, m: usize, seed: u64) { + let n = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let columns: Vec> = (0..m) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + + let coset_offset: u64 = 7; + let weights = coset_weights(n, coset_offset); + let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); + + let inv_tw = LayerTwiddles::::new_inverse(log_n).unwrap(); + let fwd_tw = + LayerTwiddles::::new((n * blowup).trailing_zeros() as u64).unwrap(); + + let slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); + let gpu_all = math_cuda::lde::coset_lde_batch_base(&slices, blowup, &weights).unwrap(); + assert_eq!(gpu_all.len(), m); + + for (c, col) in columns.iter().enumerate() { + let cpu = cpu_lde_one(col, blowup, &weights_fp, &inv_tw, &fwd_tw); + assert_eq!( + canon(&gpu_all[c]), + canon(&cpu), + "batch mismatch at col {c}, log_n={log_n}, blowup={blowup}" + ); + } +} + +#[test] +fn batch_small() { + for &m in &[1usize, 4, 16] { + for log_n in 4..=10 { + assert_batch(log_n, 4, m, 100 + log_n * 10 + m as u64); + } + } +} + +#[test] +fn batch_medium() { + for &m in &[2usize, 32] { + for log_n in 11..=14 { + assert_batch(log_n, 4, m, 200 + log_n * 10 + m as u64); + } + } +} + +#[test] +fn batch_large_one_column() { + assert_batch(18, 4, 1, 0xCAFE); +} + +#[test] +fn batch_large_32_columns() { + assert_batch(15, 4, 32, 0xBEEF); +} diff --git a/crypto/math-cuda/tests/lde_batch_ext3.rs b/crypto/math-cuda/tests/lde_batch_ext3.rs new file mode 100644 index 000000000..0a86197a5 --- /dev/null +++ b/crypto/math-cuda/tests/lde_batch_ext3.rs @@ -0,0 +1,161 @@ +//! Ext3 batched coset LDE must agree with the CPU `coset_lde_full_expand` +//! on each column independently when run over `FieldElement`. + +use math::fft::cpu::bowers_fft::LayerTwiddles; +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use math::polynomial::Polynomial; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +fn coset_weights(n: usize, g: u64) -> Vec { + let inv_n = *FieldElement::::from(n as u64) + .inv() + .unwrap() + .value(); + let mut w = Vec::with_capacity(n); + let mut cur = inv_n; + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &g); + } + w +} + +fn rand_ext3(rng: &mut ChaCha8Rng) -> Fp3 { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) +} + +fn ext3_to_u64s(col: &[Fp3]) -> Vec { + // Each Fp3 is [u64; 3] in memory; we just flatten componentwise. + let mut out = Vec::with_capacity(col.len() * 3); + for e in col { + out.push(*e.value()[0].value()); + out.push(*e.value()[1].value()); + out.push(*e.value()[2].value()); + } + out +} + +fn u64s_to_ext3(raw: &[u64]) -> Vec { + assert_eq!(raw.len() % 3, 0); + let mut out = Vec::with_capacity(raw.len() / 3); + for i in 0..raw.len() / 3 { + out.push(Fp3::new([ + Fp::from_raw(raw[i * 3 + 0]), + Fp::from_raw(raw[i * 3 + 1]), + Fp::from_raw(raw[i * 3 + 2]), + ])); + } + out +} + +fn cpu_lde_one_ext3( + col: &[Fp3], + blowup: usize, + weights_fp: &[Fp], + inv_tw: &LayerTwiddles, + fwd_tw: &LayerTwiddles, +) -> Vec { + let mut buf = col.to_vec(); + Polynomial::coset_lde_full_expand::( + &mut buf, blowup, weights_fp, inv_tw, fwd_tw, + ) + .unwrap(); + buf +} + +fn canon(xs: &[u64]) -> Vec { + xs.iter().map(|x| GoldilocksField::canonical(x)).collect() +} + +fn assert_ext3_batch(log_n: u64, blowup: usize, m: usize, seed: u64) { + let n = 1usize << log_n; + let lde_size = n * blowup; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let columns: Vec> = (0..m) + .map(|_| (0..n).map(|_| rand_ext3(&mut rng)).collect()) + .collect(); + + let coset_offset: u64 = 7; + let weights = coset_weights(n, coset_offset); + let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); + let inv_tw = LayerTwiddles::::new_inverse(log_n).unwrap(); + let fwd_tw = LayerTwiddles::::new(lde_size.trailing_zeros() as u64).unwrap(); + + // Flatten each ext3 column to 3n u64s for the GPU API. + let flat_inputs: Vec> = columns.iter().map(|c| ext3_to_u64s(c)).collect(); + let input_slices: Vec<&[u64]> = flat_inputs.iter().map(|v| v.as_slice()).collect(); + + // Pre-allocate outputs, each 3*lde_size u64s. + let mut flat_outputs: Vec> = + (0..m).map(|_| vec![0u64; 3 * lde_size]).collect(); + { + let mut out_slices: Vec<&mut [u64]> = + flat_outputs.iter_mut().map(|v| v.as_mut_slice()).collect(); + math_cuda::lde::coset_lde_batch_ext3_into( + &input_slices, + n, + blowup, + &weights, + &mut out_slices, + ) + .unwrap(); + } + + for (c, col) in columns.iter().enumerate() { + let cpu = cpu_lde_one_ext3(col, blowup, &weights_fp, &inv_tw, &fwd_tw); + let gpu: Vec = u64s_to_ext3(&flat_outputs[c]); + assert_eq!(gpu.len(), cpu.len(), "length mismatch"); + for i in 0..cpu.len() { + for k in 0..3 { + let cv = *cpu[i].value()[k].value(); + let gv = *gpu[i].value()[k].value(); + let cc = GoldilocksField::canonical(&cv); + let gc = GoldilocksField::canonical(&gv); + if cc != gc { + panic!( + "ext3 batch mismatch col={c} row={i} comp={k} log_n={log_n} blowup={blowup}: cpu={cv:#018x} (canon {cc:#018x}), gpu={gv:#018x} (canon {gc:#018x})", + ); + } + } + } + } + // Also sanity-check raw canonical equality per column. + for (c, col) in columns.iter().enumerate() { + let cpu_raw = ext3_to_u64s(&cpu_lde_one_ext3(col, blowup, &weights_fp, &inv_tw, &fwd_tw)); + assert_eq!(canon(&cpu_raw), canon(&flat_outputs[c])); + } +} + +#[test] +fn ext3_batch_small() { + for &m in &[1usize, 4, 16] { + for log_n in 4..=10 { + assert_ext3_batch(log_n, 4, m, 100 + log_n * 10 + m as u64); + } + } +} + +#[test] +fn ext3_batch_medium() { + for &m in &[2usize, 8] { + for log_n in 11..=14 { + assert_ext3_batch(log_n, 4, m, 300 + log_n * 10 + m as u64); + } + } +} + +#[test] +fn ext3_batch_large_one_column() { + assert_ext3_batch(16, 4, 1, 0xCAFE); +} diff --git a/crypto/math-cuda/tests/ntt.rs b/crypto/math-cuda/tests/ntt.rs new file mode 100644 index 000000000..d7cf3680a --- /dev/null +++ b/crypto/math-cuda/tests/ntt.rs @@ -0,0 +1,136 @@ +//! Phase-3 parity: GPU forward NTT must agree with `Polynomial::evaluate_fft` +//! as a field element, across a sweep of sizes from 2^4 to 2^20. +//! +//! Non-canonical u64s can differ between CPU and GPU while representing the +//! same element; we canonicalise both sides before comparing. + +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::IsPrimeField; +use math::polynomial::Polynomial; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; + +fn cpu_fft(coeffs: &[u64]) -> Vec { + let elems: Vec = coeffs.iter().map(|&x| Fp::from_raw(x)).collect(); + let poly = Polynomial::new(&elems); + let evals = Polynomial::evaluate_fft::(&poly, 1, None).expect("cpu fft"); + evals.into_iter().map(|e| *e.value()).collect() +} + +fn canonicalize(xs: &[u64]) -> Vec { + xs.iter() + .map(|x| GoldilocksField::canonical(x)) + .collect() +} + +fn assert_ntt_match(log_n: u64, seed: u64) { + let n = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let input: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + + let cpu = cpu_fft(&input); + let gpu = math_cuda::ntt::forward(&input).expect("gpu ntt"); + + assert_eq!(cpu.len(), gpu.len(), "length mismatch at log_n = {log_n}"); + let cpu_c = canonicalize(&cpu); + let gpu_c = canonicalize(&gpu); + for i in 0..n { + if cpu_c[i] != gpu_c[i] { + panic!( + "log_n={log_n} i={i}: cpu={:#018x} (canon {:#018x}), gpu={:#018x} (canon {:#018x})", + cpu[i], cpu_c[i], gpu[i], gpu_c[i], + ); + } + } +} + +#[test] +fn ntt_sizes_small() { + for log_n in 4..=10 { + assert_ntt_match(log_n, 100 + log_n); + } +} + +#[test] +fn ntt_sizes_medium() { + for log_n in 11..=16 { + assert_ntt_match(log_n, 200 + log_n); + } +} + +#[test] +fn ntt_size_2_to_20() { + // The hot LDE size. One seed is enough; any mismatch screams loudly. + assert_ntt_match(20, 0xDEAD); +} + +#[test] +fn ntt_trivial_sizes() { + // Power-of-two below the interesting range — should still pass. + assert_ntt_match(1, 1); + assert_ntt_match(2, 2); + assert_ntt_match(3, 3); +} + +fn assert_intt_match(log_n: u64, seed: u64) { + let n = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let evals: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + + let elems: Vec = evals.iter().map(|&x| Fp::from_raw(x)).collect(); + let cpu_poly = + Polynomial::interpolate_fft::(&elems).expect("cpu intt"); + let cpu: Vec = cpu_poly.coefficients.into_iter().map(|e| *e.value()).collect(); + + let gpu = math_cuda::ntt::inverse(&evals).expect("gpu intt"); + + let cpu_c = canonicalize(&cpu); + let gpu_c = canonicalize(&gpu); + for i in 0..n { + if cpu_c[i] != gpu_c[i] { + panic!( + "iNTT log_n={log_n} i={i}: cpu canon {:#018x}, gpu canon {:#018x}", + cpu_c[i], gpu_c[i], + ); + } + } +} + +#[test] +fn intt_sizes_small() { + for log_n in 4..=10 { + assert_intt_match(log_n, 700 + log_n); + } +} + +#[test] +fn intt_sizes_medium() { + for log_n in 11..=16 { + assert_intt_match(log_n, 800 + log_n); + } +} + +#[test] +fn intt_size_2_to_20() { + assert_intt_match(20, 0xBEEF); +} + +#[test] +fn ntt_round_trip() { + // inverse(forward(x)) == x up to canonical form. + let log_n = 14; + let n = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(42); + let x: Vec = (0..n).map(|_| rng.r#gen::() % 0xFFFF_FFFF_0000_0001).collect(); + + let evals = math_cuda::ntt::forward(&x).expect("forward"); + let back = math_cuda::ntt::inverse(&evals).expect("inverse"); + + let x_c = canonicalize(&x); + let back_c = canonicalize(&back); + assert_eq!(x_c, back_c, "round trip failed"); +} + diff --git a/crypto/stark/Cargo.toml b/crypto/stark/Cargo.toml index 53b205996..4d1f2cbca 100644 --- a/crypto/stark/Cargo.toml +++ b/crypto/stark/Cargo.toml @@ -22,6 +22,9 @@ itertools = "0.11.0" # Parallelization crates rayon = { version = "1.8.0", optional = true } +# GPU backend for trace LDE — only linked when `cuda` is enabled. +math-cuda = { path = "../math-cuda", optional = true } + # wasm wasm-bindgen = { version = "0.2", optional = true } serde-wasm-bindgen = { version = "0.5", optional = true } @@ -39,6 +42,7 @@ test_fiat_shamir = [] instruments = [] # This enables timing prints in prover and verifier debug-checks = [] # Enables validate_trace + bus balance report in prover parallel = ["dep:rayon", "crypto/parallel"] +cuda = ["dep:math-cuda"] wasm = ["dep:wasm-bindgen", "dep:serde-wasm-bindgen", "dep:web-sys"] From 79634ff2ef8e956de53326870acec43c30a81a87 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Wed, 6 May 2026 15:26:58 -0300 Subject: [PATCH 2/7] fmt --- crypto/math-cuda/build.rs | 9 +- crypto/math-cuda/src/device.rs | 6 +- crypto/math-cuda/src/lde.rs | 151 +++++++++--------- crypto/math-cuda/src/ntt.rs | 8 +- crypto/math-cuda/tests/bench_quick.rs | 68 ++++---- crypto/math-cuda/tests/evaluate_coset_ext3.rs | 14 +- crypto/math-cuda/tests/goldilocks.rs | 15 +- crypto/math-cuda/tests/lde.rs | 6 +- crypto/math-cuda/tests/lde_batch.rs | 8 +- crypto/math-cuda/tests/lde_batch_ext3.rs | 11 +- crypto/math-cuda/tests/ntt.rs | 18 ++- 11 files changed, 159 insertions(+), 155 deletions(-) diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs index a6defb5ab..d6b803bed 100644 --- a/crypto/math-cuda/build.rs +++ b/crypto/math-cuda/build.rs @@ -30,14 +30,7 @@ fn compile_ptx(src: &str, out_name: &str) { let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| "compute_89".to_string()); let status = Command::new(nvcc_path()) - .args([ - "--ptx", - "-O3", - "-std=c++17", - "-arch", - &arch, - "-o", - ]) + .args(["--ptx", "-O3", "-std=c++17", "-arch", &arch, "-o"]) .arg(&out_path) .arg(&src_path) .status() diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 2c70716a6..03f0b67ca 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -40,11 +40,7 @@ impl PinnedStaging { } } - pub fn ensure_capacity( - &mut self, - min_elems: usize, - ctx: &CudaContext, - ) -> Result<()> { + pub fn ensure_capacity(&mut self, min_elems: usize, ctx: &CudaContext) -> Result<()> { if self.capacity_elems >= min_elems { return Ok(()); } diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index fd25d1bca..8b9da522c 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -38,15 +38,14 @@ pub struct GpuLdeExt3 { pub lde_size: usize, } -pub fn coset_lde_base( - evals: &[u64], - blowup_factor: usize, - weights: &[u64], -) -> Result> { +pub fn coset_lde_base(evals: &[u64], blowup_factor: usize, weights: &[u64]) -> Result> { let n = evals.len(); assert!(n.is_power_of_two(), "evals length must be a power of two"); assert_eq!(weights.len(), n, "weights length must match evals"); - assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + assert!( + blowup_factor.is_power_of_two(), + "blowup must be power of two" + ); if n == 0 { return Ok(Vec::new()); } @@ -130,7 +129,10 @@ pub fn coset_lde_batch_base( let n = columns[0].len(); assert!(n.is_power_of_two(), "column length must be a power of two"); assert_eq!(weights.len(), n, "weights length must match column length"); - assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + assert!( + blowup_factor.is_power_of_two(), + "blowup must be power of two" + ); for c in columns.iter() { assert_eq!(c.len(), n, "all columns must be the same size"); } @@ -147,7 +149,11 @@ pub fn coset_lde_batch_base( let staging_slot = be.pinned_staging(); let debug_phases = std::env::var("MATH_CUDA_PHASE_TIMING").is_ok(); - let t_start = if debug_phases { Some(std::time::Instant::now()) } else { None }; + let t_start = if debug_phases { + Some(std::time::Instant::now()) + } else { + None + }; let phase = |label: &str, prev: &mut Option| { if let Some(p) = prev.as_ref() { let now = std::time::Instant::now(); @@ -165,7 +171,9 @@ pub fn coset_lde_batch_base( staging.ensure_capacity(m * lde_size, &be.ctx)?; // SAFETY: staging is locked, the slice alias ends before we unlock. let pinned = unsafe { staging.as_mut_slice(m * lde_size) }; - if debug_phases { phase("staging lock + grow", &mut last); } + if debug_phases { + phase("staging lock + grow", &mut last); + } // Pack columns into first m*n slots of the pinned buffer. Parallel: pinned // writes are DRAM-bandwidth bound, saturates at ~8 cores on modern @@ -176,32 +184,39 @@ pub fn coset_lde_batch_base( // SAFETY: each task writes to a disjoint `[c*n..c*n+n]` region of // `pinned`, and the outer `staging` lock guarantees no other call is // using the buffer concurrently. - let dst = unsafe { - std::slice::from_raw_parts_mut( - (pinned_base_ptr as *mut u64).add(c * n), - n, - ) - }; + let dst = + unsafe { std::slice::from_raw_parts_mut((pinned_base_ptr as *mut u64).add(c * n), n) }; dst.copy_from_slice(col); }); - if debug_phases { phase("host pack (pinned, rayon)", &mut last); } + if debug_phases { + phase("host pack (pinned, rayon)", &mut last); + } // Column layout: `buf[c * lde_size + r]`. Zeroed so the [n, lde_size) // tail of each column is already the zero-pad the CPU path does. let mut buf = stream.alloc_zeros::(m * lde_size)?; - if debug_phases { stream.synchronize()?; phase("alloc_zeros", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("alloc_zeros", &mut last); + } // One memcpy per column from the pinned buffer into the strided slots. // The pinned source hits PCIe line-rate. for c in 0..m { let mut dst = buf.slice_mut(c * lde_size..c * lde_size + n); stream.memcpy_htod(&pinned[c * n..c * n + n], &mut dst)?; } - if debug_phases { stream.synchronize()?; phase("H2D cols (pinned)", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("H2D cols (pinned)", &mut last); + } let inv_tw = be.inv_twiddles_for(log_n)?; let fwd_tw = be.fwd_twiddles_for(log_lde)?; let weights_dev = stream.clone_htod(weights)?; - if debug_phases { stream.synchronize()?; phase("twiddles + weights", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("twiddles + weights", &mut last); + } let n_u64 = n as u64; let lde_u64 = lde_size as u64; @@ -227,7 +242,10 @@ pub fn coset_lde_batch_base( } } - if debug_phases { stream.synchronize()?; phase("bit_reverse N", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("bit_reverse N", &mut last); + } // === 2. iNTT body over all columns === run_batched_ntt_body( stream.as_ref(), @@ -238,7 +256,10 @@ pub fn coset_lde_batch_base( col_stride_u64, m_u32, )?; - if debug_phases { stream.synchronize()?; phase("iNTT body", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("iNTT body", &mut last); + } // === 3. Pointwise multiply by coset weights (includes 1/N) === { @@ -278,7 +299,10 @@ pub fn coset_lde_batch_base( } } - if debug_phases { stream.synchronize()?; phase("pointwise + bit_reverse LDE", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("pointwise + bit_reverse LDE", &mut last); + } // === 5. Forward NTT on full LDE of every column === run_batched_ntt_body( stream.as_ref(), @@ -289,13 +313,18 @@ pub fn coset_lde_batch_base( col_stride_u64, m_u32, )?; - if debug_phases { stream.synchronize()?; phase("forward NTT body", &mut last); } + if debug_phases { + stream.synchronize()?; + phase("forward NTT body", &mut last); + } // Single big D2H into the reusable pinned staging buffer — pinned, one // call to the driver, saturates PCIe. stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?; stream.synchronize()?; - if debug_phases { phase("D2H (one shot into pinned)", &mut last); } + if debug_phases { + phase("D2H (one shot into pinned)", &mut last); + } // Split pinned → per-column Vecs. The first write to each virgin // Vec page-faults, which can dominate total time (~75 ms for 128 MB). @@ -308,16 +337,15 @@ pub fn coset_lde_batch_base( let mut v = Vec::::with_capacity(lde_size); unsafe { v.set_len(lde_size) }; let src = unsafe { - std::slice::from_raw_parts( - (pinned_ptr as *const u64).add(c * lde_size), - lde_size, - ) + std::slice::from_raw_parts((pinned_ptr as *const u64).add(c * lde_size), lde_size) }; v.copy_from_slice(src); v }) .collect(); - if debug_phases { phase("copy out (rayon pinned → Vecs)", &mut last); } + if debug_phases { + phase("copy out (rayon pinned → Vecs)", &mut last); + } drop(staging); Ok(out) } @@ -341,7 +369,10 @@ pub fn coset_lde_batch_base_into( let n = columns[0].len(); assert!(n.is_power_of_two(), "column length must be a power of two"); assert_eq!(weights.len(), n, "weights length must match column length"); - assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + assert!( + blowup_factor.is_power_of_two(), + "blowup must be power of two" + ); for c in columns.iter() { assert_eq!(c.len(), n, "all columns must be the same size"); } @@ -461,18 +492,12 @@ pub fn coset_lde_batch_base_into( #[allow(unused_imports)] use rayon::prelude::*; let pinned_ptr = pinned.as_ptr() as usize; - outputs - .par_iter_mut() - .enumerate() - .for_each(|(c, dst)| { - let src = unsafe { - std::slice::from_raw_parts( - (pinned_ptr as *const u64).add(c * lde_size), - lde_size, - ) - }; - dst.copy_from_slice(src); - }); + outputs.par_iter_mut().enumerate().for_each(|(c, dst)| { + let src = unsafe { + std::slice::from_raw_parts((pinned_ptr as *const u64).add(c * lde_size), lde_size) + }; + dst.copy_from_slice(src); + }); drop(staging); Ok(()) } @@ -497,15 +522,8 @@ pub fn evaluate_poly_coset_batch_ext3_into( weights: &[u64], outputs: &mut [&mut [u64]], ) -> Result<()> { - evaluate_poly_coset_batch_ext3_into_inner( - coefs, - n, - blowup_factor, - weights, - outputs, - false, - ) - .map(|_| ()) + evaluate_poly_coset_batch_ext3_into_inner(coefs, n, blowup_factor, weights, outputs, false) + .map(|_| ()) } /// Same as [`evaluate_poly_coset_batch_ext3_into`] but retains the de- @@ -519,14 +537,8 @@ pub fn evaluate_poly_coset_batch_ext3_into_keep( weights: &[u64], outputs: &mut [&mut [u64]], ) -> Result { - let opt = evaluate_poly_coset_batch_ext3_into_inner( - coefs, - n, - blowup_factor, - weights, - outputs, - true, - )?; + let opt = + evaluate_poly_coset_batch_ext3_into_inner(coefs, n, blowup_factor, weights, outputs, true)?; Ok(opt.expect("keep_device_buf=true must return Some")) } @@ -724,7 +736,10 @@ pub fn coset_lde_batch_ext3_into( assert_eq!(outputs.len(), m, "outputs must match columns count"); assert!(n.is_power_of_two(), "n must be a power of two"); assert_eq!(weights.len(), n, "weights length must match n"); - assert!(blowup_factor.is_power_of_two(), "blowup must be power of two"); + assert!( + blowup_factor.is_power_of_two(), + "blowup must be power of two" + ); for c in columns.iter() { assert_eq!(c.len(), 3 * n, "each ext3 column must be 3*n u64s"); } @@ -757,22 +772,13 @@ pub fn coset_lde_batch_ext3_into( columns.par_iter().enumerate().for_each(|(c, col)| { // SAFETY: disjoint regions per c; staging lock held. let slab_a = unsafe { - std::slice::from_raw_parts_mut( - (pinned_ptr_u as *mut u64).add((c * 3 + 0) * n), - n, - ) + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 0) * n), n) }; let slab_b = unsafe { - std::slice::from_raw_parts_mut( - (pinned_ptr_u as *mut u64).add((c * 3 + 1) * n), - n, - ) + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 1) * n), n) }; let slab_c = unsafe { - std::slice::from_raw_parts_mut( - (pinned_ptr_u as *mut u64).add((c * 3 + 2) * n), - n, - ) + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 2) * n), n) }; for i in 0..n { slab_a[i] = col[i * 3 + 0]; @@ -981,4 +987,3 @@ fn run_batched_ntt_body( } Ok(()) } - diff --git a/crypto/math-cuda/src/ntt.rs b/crypto/math-cuda/src/ntt.rs index 0ebb015ea..31333c3ae 100644 --- a/crypto/math-cuda/src/ntt.rs +++ b/crypto/math-cuda/src/ntt.rs @@ -87,13 +87,7 @@ fn ntt_inplace(input: &[u64], forward: bool) -> Result> { // 2. DIT butterfly levels. For log_n >= 8 we fuse 8 levels per kernel via // the shmem kernel; for very small sizes (< 256 elements) we stick with // the per-level kernel because the shmem block dimensions assume n ≥ 256. - run_ntt_body( - stream.as_ref(), - &mut x_dev, - tw_dev.as_ref(), - n_u64, - log_n, - )?; + run_ntt_body(stream.as_ref(), &mut x_dev, tw_dev.as_ref(), n_u64, log_n)?; // 3. For iNTT, multiply by 1/n. if !forward { diff --git a/crypto/math-cuda/tests/bench_quick.rs b/crypto/math-cuda/tests/bench_quick.rs index 561331b74..41e9444c3 100644 --- a/crypto/math-cuda/tests/bench_quick.rs +++ b/crypto/math-cuda/tests/bench_quick.rs @@ -15,7 +15,10 @@ use rayon::prelude::*; type Fp = FieldElement; fn coset_weights(n: usize, g: u64) -> Vec { - let inv_n = *FieldElement::::from(n as u64).inv().unwrap().value(); + let inv_n = *FieldElement::::from(n as u64) + .inv() + .unwrap() + .value(); let mut w = Vec::with_capacity(n); let mut cur = inv_n; for _ in 0..n { @@ -54,7 +57,11 @@ fn bench_lde_2_to_18_blowup_4() { for _ in 0..TRIALS { let mut buf: Vec = input.iter().map(|&x| Fp::from_raw(x)).collect(); Polynomial::coset_lde_full_expand::( - &mut buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + &mut buf, + blowup, + &weights_fp, + &inv_tw, + &fwd_tw, ) .unwrap(); std::hint::black_box(&buf); @@ -101,12 +108,7 @@ fn bench_lde_multi_column_parallel() { let num_cols = 64; // Warm up. - let _ = math_cuda::lde::coset_lde_base( - &vec![0u64; n], - blowup, - &coset_weights(n, 7), - ) - .unwrap(); + let _ = math_cuda::lde::coset_lde_base(&vec![0u64; n], blowup, &coset_weights(n, 7)).unwrap(); // Build input data. let mut rng = ChaCha8Rng::seed_from_u64(11); @@ -134,7 +136,11 @@ fn bench_lde_multi_column_parallel() { let t0 = Instant::now(); cpu_bufs.par_iter_mut().for_each(|buf| { Polynomial::coset_lde_full_expand::( - buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + buf, + blowup, + &weights_fp, + &inv_tw, + &fwd_tw, ) .unwrap(); }); @@ -164,15 +170,12 @@ fn bench_lde_batched_prover_scale() { let weights = coset_weights(n, 7); let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); let inv_tw = LayerTwiddles::::new_inverse(log_n as u64).unwrap(); - let fwd_tw = LayerTwiddles::::new( - (n * blowup).trailing_zeros() as u64, - ) - .unwrap(); + let fwd_tw = + LayerTwiddles::::new((n * blowup).trailing_zeros() as u64).unwrap(); let warm_slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); for _ in 0..8 { - let _ = - math_cuda::lde::coset_lde_batch_base(&warm_slices, blowup, &weights).unwrap(); + let _ = math_cuda::lde::coset_lde_batch_base(&warm_slices, blowup, &weights).unwrap(); } let slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); @@ -194,7 +197,11 @@ fn bench_lde_batched_prover_scale() { let t0 = Instant::now(); cpu_bufs.par_iter_mut().for_each(|buf| { Polynomial::coset_lde_full_expand::( - buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + buf, + blowup, + &weights_fp, + &inv_tw, + &fwd_tw, ) .unwrap(); }); @@ -227,15 +234,12 @@ fn bench_lde_batched_vs_rayon_cpu() { // one-time pinned staging alloc cost. let warm_slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); for _ in 0..64 { - let _ = - math_cuda::lde::coset_lde_batch_base(&warm_slices, blowup, &weights).unwrap(); + let _ = math_cuda::lde::coset_lde_batch_base(&warm_slices, blowup, &weights).unwrap(); } let weights_fp: Vec = weights.iter().map(|&w| Fp::from_raw(w)).collect(); let inv_tw = LayerTwiddles::::new_inverse(log_n as u64).unwrap(); - let fwd_tw = LayerTwiddles::::new( - (n * blowup).trailing_zeros() as u64, - ) - .unwrap(); + let fwd_tw = + LayerTwiddles::::new((n * blowup).trailing_zeros() as u64).unwrap(); // GPU batched — first run may include lazy device init; do a few to stabilise. let slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); @@ -254,7 +258,11 @@ fn bench_lde_batched_vs_rayon_cpu() { let t0 = Instant::now(); cpu_bufs.par_iter_mut().for_each(|buf| { Polynomial::coset_lde_full_expand::( - buf, blowup, &weights_fp, &inv_tw, &fwd_tw, + buf, + blowup, + &weights_fp, + &inv_tw, + &fwd_tw, ) .unwrap(); }); @@ -277,12 +285,7 @@ fn bench_lde_multi_column_serialized_gpu() { let n = 1usize << log_n; let num_cols = 64; - let _ = math_cuda::lde::coset_lde_base( - &vec![0u64; n], - blowup, - &coset_weights(n, 7), - ) - .unwrap(); + let _ = math_cuda::lde::coset_lde_base(&vec![0u64; n], blowup, &coset_weights(n, 7)).unwrap(); let mut rng = ChaCha8Rng::seed_from_u64(13); let columns: Vec> = (0..num_cols) @@ -320,12 +323,7 @@ fn bench_lde_multi_column_gpu_limited_threads() { let n = 1usize << log_n; let num_cols = 64; - let _ = math_cuda::lde::coset_lde_base( - &vec![0u64; n], - blowup, - &coset_weights(n, 7), - ) - .unwrap(); + let _ = math_cuda::lde::coset_lde_base(&vec![0u64; n], blowup, &coset_weights(n, 7)).unwrap(); let mut rng = ChaCha8Rng::seed_from_u64(12); let columns: Vec> = (0..num_cols) diff --git a/crypto/math-cuda/tests/evaluate_coset_ext3.rs b/crypto/math-cuda/tests/evaluate_coset_ext3.rs index a79195291..020c39241 100644 --- a/crypto/math-cuda/tests/evaluate_coset_ext3.rs +++ b/crypto/math-cuda/tests/evaluate_coset_ext3.rs @@ -81,13 +81,8 @@ fn assert_evaluate_coset(log_n: u64, blowup: usize, m: usize, offset: u64, seed: .iter() .map(|coefs| { let p = Polynomial::new(coefs); - Polynomial::evaluate_offset_fft::( - &p, - blowup, - Some(n), - &offset_fp, - ) - .unwrap() + Polynomial::evaluate_offset_fft::(&p, blowup, Some(n), &offset_fp) + .unwrap() }) .collect(); @@ -114,7 +109,10 @@ fn assert_evaluate_coset(log_n: u64, blowup: usize, m: usize, offset: u64, seed: for i in 0..gpu.len() { let g = canon_fp3(&gpu[i]); let cc = canon_fp3(&cpu[c][i]); - assert_eq!(g, cc, "eval mismatch col={c} row={i} log_n={log_n} blowup={blowup}"); + assert_eq!( + g, cc, + "eval mismatch col={c} row={i} log_n={log_n} blowup={blowup}" + ); } } } diff --git a/crypto/math-cuda/tests/goldilocks.rs b/crypto/math-cuda/tests/goldilocks.rs index 317ffb0f8..37a5d8533 100644 --- a/crypto/math-cuda/tests/goldilocks.rs +++ b/crypto/math-cuda/tests/goldilocks.rs @@ -112,12 +112,15 @@ fn gpu_goldilocks_edge_cases() { } } - let cases: &[(&str, fn(&[u64], &[u64]) -> math_cuda::Result>, fn(&u64, &u64) -> u64)] = - &[ - ("gl_add", math_cuda::gl_add_u64, GoldilocksField::add), - ("gl_sub", math_cuda::gl_sub_u64, GoldilocksField::sub), - ("gl_mul", math_cuda::gl_mul_u64, GoldilocksField::mul), - ]; + let cases: &[( + &str, + fn(&[u64], &[u64]) -> math_cuda::Result>, + fn(&u64, &u64) -> u64, + )] = &[ + ("gl_add", math_cuda::gl_add_u64, GoldilocksField::add), + ("gl_sub", math_cuda::gl_sub_u64, GoldilocksField::sub), + ("gl_mul", math_cuda::gl_mul_u64, GoldilocksField::mul), + ]; for (op, gpu_fn, cpu_fn) in cases { let expected: Vec = a.iter().zip(&b).map(|(x, y)| cpu_fn(x, y)).collect(); diff --git a/crypto/math-cuda/tests/lde.rs b/crypto/math-cuda/tests/lde.rs index 9648f833a..75ea80867 100644 --- a/crypto/math-cuda/tests/lde.rs +++ b/crypto/math-cuda/tests/lde.rs @@ -69,7 +69,11 @@ fn assert_lde_match(log_n: u64, blowup_factor: usize, seed: u64) { let cpu = cpu_lde(&evals, blowup_factor, coset_offset); let gpu = math_cuda::lde::coset_lde_base(&evals, blowup_factor, &weights).expect("gpu lde"); - assert_eq!(cpu.len(), gpu.len(), "length mismatch (log_n={log_n}, blowup={blowup_factor})"); + assert_eq!( + cpu.len(), + gpu.len(), + "length mismatch (log_n={log_n}, blowup={blowup_factor})" + ); let cpu_c = canon(&cpu); let gpu_c = canon(&gpu); for (i, (e, a)) in cpu_c.iter().zip(&gpu_c).enumerate() { diff --git a/crypto/math-cuda/tests/lde_batch.rs b/crypto/math-cuda/tests/lde_batch.rs index 67f975728..153e5d3e5 100644 --- a/crypto/math-cuda/tests/lde_batch.rs +++ b/crypto/math-cuda/tests/lde_batch.rs @@ -25,7 +25,13 @@ fn coset_weights(n: usize, g: u64) -> Vec { w } -fn cpu_lde_one(col: &[u64], blowup: usize, weights_fp: &[Fp], inv_tw: &LayerTwiddles, fwd_tw: &LayerTwiddles) -> Vec { +fn cpu_lde_one( + col: &[u64], + blowup: usize, + weights_fp: &[Fp], + inv_tw: &LayerTwiddles, + fwd_tw: &LayerTwiddles, +) -> Vec { let mut buf: Vec = col.iter().map(|&x| Fp::from_raw(x)).collect(); Polynomial::coset_lde_full_expand::( &mut buf, blowup, weights_fp, inv_tw, fwd_tw, diff --git a/crypto/math-cuda/tests/lde_batch_ext3.rs b/crypto/math-cuda/tests/lde_batch_ext3.rs index 0a86197a5..ff19f6c56 100644 --- a/crypto/math-cuda/tests/lde_batch_ext3.rs +++ b/crypto/math-cuda/tests/lde_batch_ext3.rs @@ -97,8 +97,7 @@ fn assert_ext3_batch(log_n: u64, blowup: usize, m: usize, seed: u64) { let input_slices: Vec<&[u64]> = flat_inputs.iter().map(|v| v.as_slice()).collect(); // Pre-allocate outputs, each 3*lde_size u64s. - let mut flat_outputs: Vec> = - (0..m).map(|_| vec![0u64; 3 * lde_size]).collect(); + let mut flat_outputs: Vec> = (0..m).map(|_| vec![0u64; 3 * lde_size]).collect(); { let mut out_slices: Vec<&mut [u64]> = flat_outputs.iter_mut().map(|v| v.as_mut_slice()).collect(); @@ -132,7 +131,13 @@ fn assert_ext3_batch(log_n: u64, blowup: usize, m: usize, seed: u64) { } // Also sanity-check raw canonical equality per column. for (c, col) in columns.iter().enumerate() { - let cpu_raw = ext3_to_u64s(&cpu_lde_one_ext3(col, blowup, &weights_fp, &inv_tw, &fwd_tw)); + let cpu_raw = ext3_to_u64s(&cpu_lde_one_ext3( + col, + blowup, + &weights_fp, + &inv_tw, + &fwd_tw, + )); assert_eq!(canon(&cpu_raw), canon(&flat_outputs[c])); } } diff --git a/crypto/math-cuda/tests/ntt.rs b/crypto/math-cuda/tests/ntt.rs index d7cf3680a..b6697b82d 100644 --- a/crypto/math-cuda/tests/ntt.rs +++ b/crypto/math-cuda/tests/ntt.rs @@ -21,9 +21,7 @@ fn cpu_fft(coeffs: &[u64]) -> Vec { } fn canonicalize(xs: &[u64]) -> Vec { - xs.iter() - .map(|x| GoldilocksField::canonical(x)) - .collect() + xs.iter().map(|x| GoldilocksField::canonical(x)).collect() } fn assert_ntt_match(log_n: u64, seed: u64) { @@ -81,9 +79,12 @@ fn assert_intt_match(log_n: u64, seed: u64) { let evals: Vec = (0..n).map(|_| rng.r#gen::()).collect(); let elems: Vec = evals.iter().map(|&x| Fp::from_raw(x)).collect(); - let cpu_poly = - Polynomial::interpolate_fft::(&elems).expect("cpu intt"); - let cpu: Vec = cpu_poly.coefficients.into_iter().map(|e| *e.value()).collect(); + let cpu_poly = Polynomial::interpolate_fft::(&elems).expect("cpu intt"); + let cpu: Vec = cpu_poly + .coefficients + .into_iter() + .map(|e| *e.value()) + .collect(); let gpu = math_cuda::ntt::inverse(&evals).expect("gpu intt"); @@ -124,7 +125,9 @@ fn ntt_round_trip() { let log_n = 14; let n = 1usize << log_n; let mut rng = ChaCha8Rng::seed_from_u64(42); - let x: Vec = (0..n).map(|_| rng.r#gen::() % 0xFFFF_FFFF_0000_0001).collect(); + let x: Vec = (0..n) + .map(|_| rng.r#gen::() % 0xFFFF_FFFF_0000_0001) + .collect(); let evals = math_cuda::ntt::forward(&x).expect("forward"); let back = math_cuda::ntt::inverse(&evals).expect("inverse"); @@ -133,4 +136,3 @@ fn ntt_round_trip() { let back_c = canonicalize(&back); assert_eq!(x_c, back_c, "round trip failed"); } - From ac6fbb5972cc4cd4e228bd1adb2e659760c7b6e1 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Wed, 6 May 2026 15:55:58 -0300 Subject: [PATCH 3/7] fix clippy --- crypto/math-cuda/build.rs | 34 +++++++++++++++++-- crypto/math-cuda/src/lde.rs | 20 +++++++---- crypto/math-cuda/tests/evaluate_coset_ext3.rs | 2 +- crypto/math-cuda/tests/goldilocks.rs | 10 +++--- crypto/math-cuda/tests/lde.rs | 2 +- crypto/math-cuda/tests/lde_batch.rs | 2 +- crypto/math-cuda/tests/lde_batch_ext3.rs | 4 +-- crypto/math-cuda/tests/ntt.rs | 2 +- 8 files changed, 54 insertions(+), 22 deletions(-) diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs index d6b803bed..43edaa31c 100644 --- a/crypto/math-cuda/build.rs +++ b/crypto/math-cuda/build.rs @@ -1,4 +1,5 @@ use std::env; +use std::fs; use std::path::PathBuf; use std::process::Command; @@ -13,7 +14,7 @@ fn nvcc_path() -> PathBuf { cuda_home().join("bin").join("nvcc") } -fn compile_ptx(src: &str, out_name: &str) { +fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) { let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); let src_path = manifest_dir.join("kernels").join(src); @@ -24,6 +25,16 @@ fn compile_ptx(src: &str, out_name: &str) { println!("cargo:rerun-if-env-changed=CUDA_PATH"); println!("cargo:rerun-if-env-changed=CUDARC_NVCC_ARCH"); + // No nvcc on PATH → emit an empty PTX stub so the crate still compiles. + // include_str! in src/device.rs needs the file to exist at build time. + // Any runtime kernel call will then panic from cudarc when loading the + // empty module — which is the right failure mode (we can't run GPU code + // without nvcc on the build host anyway). + if !have_nvcc { + fs::write(&out_path, "").expect("failed to write empty PTX stub"); + return; + } + // Emit PTX for a virtual architecture; the CUDA driver JIT-compiles it for the // actual GPU at load time, so one PTX works across Ada/Hopper/Blackwell. Override // with CUDARC_NVCC_ARCH to pin a specific compute capability. @@ -45,6 +56,23 @@ fn main() { // Headers are not compiled; emit rerun-if-changed so edits trigger rebuilds. println!("cargo:rerun-if-changed=kernels/goldilocks.cuh"); println!("cargo:rerun-if-changed=kernels/ext3.cuh"); - compile_ptx("arith.cu", "arith.ptx"); - compile_ptx("ntt.cu", "ntt.ptx"); + + // Probe for nvcc once. Workspace consumers (clippy, fmt, CPU-only test + // runners) build math-cuda incidentally without using its kernels; allow + // those to succeed by stubbing out PTX when nvcc is unavailable. + let have_nvcc = Command::new(nvcc_path()) + .arg("--version") + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + if !have_nvcc { + println!( + "cargo:warning=math-cuda: nvcc not found at {} — emitting empty PTX stubs. \ + Runtime GPU calls will panic. Install CUDA and rebuild for a working backend.", + nvcc_path().display() + ); + } + + compile_ptx("arith.cu", "arith.ptx", have_nvcc); + compile_ptx("ntt.cu", "ntt.ptx", have_nvcc); } diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 8b9da522c..22d62ca3e 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -329,13 +329,19 @@ pub fn coset_lde_batch_base( // Split pinned → per-column Vecs. The first write to each virgin // Vec page-faults, which can dominate total time (~75 ms for 128 MB). // Parallelise so the fault cost spreads across CPU cores. - use rayon::prelude::*; let pinned_ptr = pinned.as_ptr() as usize; let out: Vec> = (0..m) .into_par_iter() .map(|c| { - let mut v = Vec::::with_capacity(lde_size); - unsafe { v.set_len(lde_size) }; + // set_len skips the O(N) zero-init that vec![0; n] would do + // (saves ~75 ms per 128 MB at prover scale). copy_from_slice + // below writes every slot before any reader sees the Vec. + #[allow(clippy::uninit_vec)] + let mut v = { + let mut v = Vec::::with_capacity(lde_size); + unsafe { v.set_len(lde_size) }; + v + }; let src = unsafe { std::slice::from_raw_parts((pinned_ptr as *const u64).add(c * lde_size), lde_size) }; @@ -772,7 +778,7 @@ pub fn coset_lde_batch_ext3_into( columns.par_iter().enumerate().for_each(|(c, col)| { // SAFETY: disjoint regions per c; staging lock held. let slab_a = unsafe { - std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 0) * n), n) + std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3) * n), n) }; let slab_b = unsafe { std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 1) * n), n) @@ -781,7 +787,7 @@ pub fn coset_lde_batch_ext3_into( std::slice::from_raw_parts_mut((pinned_ptr_u as *mut u64).add((c * 3 + 2) * n), n) }; for i in 0..n { - slab_a[i] = col[i * 3 + 0]; + slab_a[i] = col[i * 3]; slab_b[i] = col[i * 3 + 1]; slab_c[i] = col[i * 3 + 2]; } @@ -885,7 +891,7 @@ pub fn coset_lde_batch_ext3_into( outputs.par_iter_mut().enumerate().for_each(|(c, dst)| { let slab_a = unsafe { std::slice::from_raw_parts( - (pinned_const as *const u64).add((c * 3 + 0) * lde_size), + (pinned_const as *const u64).add((c * 3) * lde_size), lde_size, ) }; @@ -902,7 +908,7 @@ pub fn coset_lde_batch_ext3_into( ) }; for i in 0..lde_size { - dst[i * 3 + 0] = slab_a[i]; + dst[i * 3] = slab_a[i]; dst[i * 3 + 1] = slab_b[i]; dst[i * 3 + 2] = slab_c[i]; } diff --git a/crypto/math-cuda/tests/evaluate_coset_ext3.rs b/crypto/math-cuda/tests/evaluate_coset_ext3.rs index 020c39241..334b2601d 100644 --- a/crypto/math-cuda/tests/evaluate_coset_ext3.rs +++ b/crypto/math-cuda/tests/evaluate_coset_ext3.rs @@ -47,7 +47,7 @@ fn u64s_to_ext3(raw: &[u64]) -> Vec { let mut out = Vec::with_capacity(raw.len() / 3); for i in 0..raw.len() / 3 { out.push(Fp3::new([ - Fp::from_raw(raw[i * 3 + 0]), + Fp::from_raw(raw[i * 3]), Fp::from_raw(raw[i * 3 + 1]), Fp::from_raw(raw[i * 3 + 2]), ])); diff --git a/crypto/math-cuda/tests/goldilocks.rs b/crypto/math-cuda/tests/goldilocks.rs index 37a5d8533..33b7bf5e7 100644 --- a/crypto/math-cuda/tests/goldilocks.rs +++ b/crypto/math-cuda/tests/goldilocks.rs @@ -78,7 +78,7 @@ fn gpu_gl_mul_matches_cpu() { #[test] fn gpu_gl_neg_matches_cpu() { let a = sample_inputs(7); - let expected: Vec = a.iter().map(|x| GoldilocksField::neg(x)).collect(); + let expected: Vec = a.iter().map(GoldilocksField::neg).collect(); let actual = math_cuda::gl_neg_u64(&a).expect("GPU gl_neg"); assert_raw_eq("gl_neg", &expected, &actual); } @@ -112,11 +112,9 @@ fn gpu_goldilocks_edge_cases() { } } - let cases: &[( - &str, - fn(&[u64], &[u64]) -> math_cuda::Result>, - fn(&u64, &u64) -> u64, - )] = &[ + type GpuOp = fn(&[u64], &[u64]) -> math_cuda::Result>; + type CpuOp = fn(&u64, &u64) -> u64; + let cases: &[(&str, GpuOp, CpuOp)] = &[ ("gl_add", math_cuda::gl_add_u64, GoldilocksField::add), ("gl_sub", math_cuda::gl_sub_u64, GoldilocksField::sub), ("gl_mul", math_cuda::gl_mul_u64, GoldilocksField::mul), diff --git a/crypto/math-cuda/tests/lde.rs b/crypto/math-cuda/tests/lde.rs index 75ea80867..facd2d861 100644 --- a/crypto/math-cuda/tests/lde.rs +++ b/crypto/math-cuda/tests/lde.rs @@ -52,7 +52,7 @@ fn cpu_lde(evals: &[u64], blowup_factor: usize, coset_offset: u64) -> Vec { } fn canon(xs: &[u64]) -> Vec { - xs.iter().map(|x| GoldilocksField::canonical(x)).collect() + xs.iter().map(GoldilocksField::canonical).collect() } fn assert_lde_match(log_n: u64, blowup_factor: usize, seed: u64) { diff --git a/crypto/math-cuda/tests/lde_batch.rs b/crypto/math-cuda/tests/lde_batch.rs index 153e5d3e5..b7120ca28 100644 --- a/crypto/math-cuda/tests/lde_batch.rs +++ b/crypto/math-cuda/tests/lde_batch.rs @@ -41,7 +41,7 @@ fn cpu_lde_one( } fn canon(xs: &[u64]) -> Vec { - xs.iter().map(|x| GoldilocksField::canonical(x)).collect() + xs.iter().map(GoldilocksField::canonical).collect() } fn assert_batch(log_n: u64, blowup: usize, m: usize, seed: u64) { diff --git a/crypto/math-cuda/tests/lde_batch_ext3.rs b/crypto/math-cuda/tests/lde_batch_ext3.rs index ff19f6c56..c1156edaa 100644 --- a/crypto/math-cuda/tests/lde_batch_ext3.rs +++ b/crypto/math-cuda/tests/lde_batch_ext3.rs @@ -51,7 +51,7 @@ fn u64s_to_ext3(raw: &[u64]) -> Vec { let mut out = Vec::with_capacity(raw.len() / 3); for i in 0..raw.len() / 3 { out.push(Fp3::new([ - Fp::from_raw(raw[i * 3 + 0]), + Fp::from_raw(raw[i * 3]), Fp::from_raw(raw[i * 3 + 1]), Fp::from_raw(raw[i * 3 + 2]), ])); @@ -75,7 +75,7 @@ fn cpu_lde_one_ext3( } fn canon(xs: &[u64]) -> Vec { - xs.iter().map(|x| GoldilocksField::canonical(x)).collect() + xs.iter().map(GoldilocksField::canonical).collect() } fn assert_ext3_batch(log_n: u64, blowup: usize, m: usize, seed: u64) { diff --git a/crypto/math-cuda/tests/ntt.rs b/crypto/math-cuda/tests/ntt.rs index b6697b82d..f3689cf94 100644 --- a/crypto/math-cuda/tests/ntt.rs +++ b/crypto/math-cuda/tests/ntt.rs @@ -21,7 +21,7 @@ fn cpu_fft(coeffs: &[u64]) -> Vec { } fn canonicalize(xs: &[u64]) -> Vec { - xs.iter().map(|x| GoldilocksField::canonical(x)).collect() + xs.iter().map(GoldilocksField::canonical).collect() } fn assert_ntt_match(log_n: u64, seed: u64) { From 0944a8326b2139a7e2cbe3a3fd643fd69d73706f Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Fri, 8 May 2026 12:28:34 -0300 Subject: [PATCH 4/7] fix comments --- crypto/math-cuda/kernels/arith.cu | 2 +- crypto/math-cuda/tests/lde.rs | 2 +- crypto/math-cuda/tests/ntt.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/math-cuda/kernels/arith.cu b/crypto/math-cuda/kernels/arith.cu index 4bee9b8bb..ac73f7fd1 100644 --- a/crypto/math-cuda/kernels/arith.cu +++ b/crypto/math-cuda/kernels/arith.cu @@ -1,4 +1,4 @@ -// Element-wise Goldilocks kernels used by the Phase-2 parity tests. These mirror +// Element-wise Goldilocks kernels used by the parity tests. These mirror // the CPU reference in `crypto/math/src/field/goldilocks.rs` so raw u64 outputs // are bit-identical to the CPU path. diff --git a/crypto/math-cuda/tests/lde.rs b/crypto/math-cuda/tests/lde.rs index facd2d861..33f98f9ae 100644 --- a/crypto/math-cuda/tests/lde.rs +++ b/crypto/math-cuda/tests/lde.rs @@ -1,4 +1,4 @@ -//! Phase-5 parity: GPU `coset_lde_base` must match the CPU +//! Parity: GPU `coset_lde_base` must match the CPU //! `Polynomial::coset_lde_full_expand` for a sweep of realistic sizes and //! blowup factors. diff --git a/crypto/math-cuda/tests/ntt.rs b/crypto/math-cuda/tests/ntt.rs index f3689cf94..17a556c74 100644 --- a/crypto/math-cuda/tests/ntt.rs +++ b/crypto/math-cuda/tests/ntt.rs @@ -1,4 +1,4 @@ -//! Phase-3 parity: GPU forward NTT must agree with `Polynomial::evaluate_fft` +//! Parity: GPU forward NTT must agree with `Polynomial::evaluate_fft` //! as a field element, across a sweep of sizes from 2^4 to 2^20. //! //! Non-canonical u64s can differ between CPU and GPU while representing the From 12292ceeb486f838c9f05436febaefd26e5c2ab5 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Fri, 8 May 2026 12:35:28 -0300 Subject: [PATCH 5/7] fix comments --- crypto/math-cuda/kernels/ntt.cu | 10 ++++++---- crypto/math-cuda/src/device.rs | 5 +++-- crypto/math-cuda/src/lde.rs | 4 ---- crypto/math-cuda/tests/bench_quick.rs | 9 +++++---- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crypto/math-cuda/kernels/ntt.cu b/crypto/math-cuda/kernels/ntt.cu index 2a5c8c786..a8e50d4e1 100644 --- a/crypto/math-cuda/kernels/ntt.cu +++ b/crypto/math-cuda/kernels/ntt.cu @@ -1,5 +1,6 @@ -// Radix-2 DIT NTT over Goldilocks. One kernel per butterfly level; the caller -// runs `bit_reverse_permute` once before the first level. +// Radix-2 DIT NTT over Goldilocks: per-level, fused 8-level (shmem), and +// batched (multi-column) variants. The caller runs `bit_reverse_permute` +// once before the first butterfly level. // // Input layout: bit-reversed-order coefficients (after `bit_reverse_permute`). // Output layout: natural-order evaluations — matches the CPU `evaluate_fft` contract. @@ -179,8 +180,9 @@ extern "C" __global__ void scalar_mul_batched(uint64_t *data, /// One DIT butterfly level. Thread `tid` (of n/2 total) owns exactly one /// butterfly pair (i0, i1 = i0 + half). Twiddle picked from the shared full -/// `tw` table at stride `n / block_size`. Kept for log_n < 8 where shmem -/// fusion is overkill. +/// `tw` table at stride `n / block_size`. Used for levels 0..7 when n < 256 +/// (shmem fusion needs at least 256 elements), and for levels >= 8 of any +/// size (above the shmem-fusion window). extern "C" __global__ void ntt_dit_level(uint64_t *x, const uint64_t *tw, uint64_t n, diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 03f0b67ca..efdaf1518 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -15,8 +15,9 @@ use math::field::traits::IsFFTField; use crate::Result; use crate::ntt::{twiddles_forward, twiddles_inverse}; -/// Reusable pinned host staging buffer. One per stream; the stream's LDE call -/// holds its buffer's lock across the D2H + memcpy-to-user-Vecs window. +/// Reusable pinned host staging buffer. Shared across all streams via a +/// `Mutex` (see `Backend::pinned_staging`); the LDE call holds the lock +/// across the D2H + memcpy-to-user-Vecs window. /// /// Allocated with `cuMemHostAlloc(flags=0)` — portable, non-write-combined, /// so both DMA writes from device and CPU reads into user Vecs run at full diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 22d62ca3e..5f1b95f34 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -517,10 +517,6 @@ pub fn coset_lde_batch_base_into( /// Skips the iFFT stage of [`coset_lde_batch_ext3_into`] (input is /// coefficients, not evaluations). Weights encode the coset shift: /// `weights[k] = offset^k` (NO 1/N because iFFT normalisation doesn't apply). -/// -/// Used by the stark prover to GPU-accelerate -/// `evaluate_polynomial_on_lde_domain` calls inside the -/// `number_of_parts > 2` branch of the composition-polynomial LDE. pub fn evaluate_poly_coset_batch_ext3_into( coefs: &[&[u64]], n: usize, diff --git a/crypto/math-cuda/tests/bench_quick.rs b/crypto/math-cuda/tests/bench_quick.rs index 41e9444c3..9e3883085 100644 --- a/crypto/math-cuda/tests/bench_quick.rs +++ b/crypto/math-cuda/tests/bench_quick.rs @@ -99,8 +99,9 @@ fn bench_lde_2_to_16_blowup_4() { #[test] #[ignore = "informal perf probe; run with --ignored"] fn bench_lde_multi_column_parallel() { - // Simulates the prover's Phase A: many columns processed via rayon. - // log_n = 16 keeps memory footprint manageable while still stressing streams. + // Simulates a multi-column workload processed via rayon: many columns + // dispatched concurrently to stress the stream pool. log_n = 16 keeps + // memory footprint manageable. let log_n = 16u32; let blowup = 4usize; let n = 1usize << log_n; @@ -156,8 +157,8 @@ fn bench_lde_multi_column_parallel() { #[test] #[ignore = "informal perf probe; run with --ignored"] fn bench_lde_batched_prover_scale() { - // Realistic large-table shape from the 1M-fib prover: ~1M rows, blowup 4, - // a few dozen columns. This is what actually runs in expand_columns_to_lde. + // Realistic large-table shape: ~1M rows, blowup 4, a few dozen columns. + // Exercises batched LDE at prover-scale sizes. let log_n = 20u32; // 1M rows let blowup = 4usize; let n = 1usize << log_n; From 6d4c65427da360a86e782b0795acaa257dcf707b Mon Sep 17 00:00:00 2001 From: MauroFab Date: Tue, 12 May 2026 15:40:53 +0200 Subject: [PATCH 6/7] test(math-cuda): adversarial ext3, known-poly NTT, _into parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ext3_sub_kernel + ext3_sub_u64 wrapper (test infrastructure; the ext3::sub device function was previously unreachable from Rust). - Add tests/ext3_edge.rs: 7 adversarial tests for ext3::mul dot3 overflow tracking — (p-1)^3, u64::MAX^3, non-canonical p representations, identity, and 98 base-field edge pairs in a/b/c slots. - Add tests/ext3_sub.rs: parity test for the new sub wrapper. - Add tests/ntt_known.rs: known-polynomial tests for p(x) = 1+x at sizes 16 and 256, and p(x) = x^(N/2) for the alternating ±1 pattern. - Add tests/lde_batch_into.rs: direct parity test for coset_lde_batch_base_into vs coset_lde_batch_base. 15 new tests total, all green on RTX 5090. --- crypto/math-cuda/kernels/arith.cu | 14 ++ crypto/math-cuda/src/device.rs | 2 + crypto/math-cuda/src/lib.rs | 29 ++++ crypto/math-cuda/tests/ext3_edge.rs | 176 +++++++++++++++++++++++ crypto/math-cuda/tests/ext3_sub.rs | 109 ++++++++++++++ crypto/math-cuda/tests/lde_batch_into.rs | 87 +++++++++++ crypto/math-cuda/tests/ntt_known.rs | 125 ++++++++++++++++ 7 files changed, 542 insertions(+) create mode 100644 crypto/math-cuda/tests/ext3_edge.rs create mode 100644 crypto/math-cuda/tests/ext3_sub.rs create mode 100644 crypto/math-cuda/tests/lde_batch_into.rs create mode 100644 crypto/math-cuda/tests/ntt_known.rs diff --git a/crypto/math-cuda/kernels/arith.cu b/crypto/math-cuda/kernels/arith.cu index ac73f7fd1..b1a6bb8ab 100644 --- a/crypto/math-cuda/kernels/arith.cu +++ b/crypto/math-cuda/kernels/arith.cu @@ -81,3 +81,17 @@ extern "C" __global__ void ext3_add_kernel(const uint64_t *a_int, c_int[tid*3 + 1] = r.b; c_int[tid*3 + 2] = r.c; } + +extern "C" __global__ void ext3_sub_kernel(const uint64_t *a_int, + const uint64_t *b_int, + uint64_t *c_int, + uint64_t n) { + uint64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + ext3::Fe3 a = ext3::make(a_int[tid*3 + 0], a_int[tid*3 + 1], a_int[tid*3 + 2]); + ext3::Fe3 b = ext3::make(b_int[tid*3 + 0], b_int[tid*3 + 1], b_int[tid*3 + 2]); + ext3::Fe3 r = ext3::sub(a, b); + c_int[tid*3 + 0] = r.a; + c_int[tid*3 + 1] = r.b; + c_int[tid*3 + 2] = r.c; +} diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index efdaf1518..af5105ee5 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -118,6 +118,7 @@ pub struct Backend { pub gl_neg: CudaFunction, pub ext3_mul: CudaFunction, pub ext3_add: CudaFunction, + pub ext3_sub: CudaFunction, // ntt.ptx pub bit_reverse_permute: CudaFunction, @@ -169,6 +170,7 @@ impl Backend { gl_neg: arith.load_function("gl_neg_kernel")?, ext3_mul: arith.load_function("ext3_mul_kernel")?, ext3_add: arith.load_function("ext3_add_kernel")?, + ext3_sub: arith.load_function("ext3_sub_kernel")?, bit_reverse_permute: ntt.load_function("bit_reverse_permute")?, ntt_dit_level: ntt.load_function("ntt_dit_level")?, ntt_dit_8_levels: ntt.load_function("ntt_dit_8_levels")?, diff --git a/crypto/math-cuda/src/lib.rs b/crypto/math-cuda/src/lib.rs index 821f5bd3a..04ec08b1d 100644 --- a/crypto/math-cuda/src/lib.rs +++ b/crypto/math-cuda/src/lib.rs @@ -89,6 +89,35 @@ pub fn ext3_mul_u64(a: &[u64], b: &[u64]) -> Result> { Ok(out) } +/// Element-wise ext3 subtract. Test helper for `ext3::sub` in `ext3.cuh`. +pub fn ext3_sub_u64(a: &[u64], b: &[u64]) -> Result> { + assert_eq!(a.len(), b.len()); + assert_eq!(a.len() % 3, 0); + let n = a.len() / 3; + if n == 0 { + return Ok(Vec::new()); + } + let be = backend(); + let stream = be.next_stream(); + let a_dev = stream.clone_htod(a)?; + let b_dev = stream.clone_htod(b)?; + let mut c_dev = stream.alloc_zeros::(3 * n)?; + let cfg = LaunchConfig::for_num_elems(n as u32); + let n_u64 = n as u64; + unsafe { + stream + .launch_builder(&be.ext3_sub) + .arg(&a_dev) + .arg(&b_dev) + .arg(&mut c_dev) + .arg(&n_u64) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&c_dev)?; + stream.synchronize()?; + Ok(out) +} + /// Element-wise ext3 add. pub fn ext3_add_u64(a: &[u64], b: &[u64]) -> Result> { assert_eq!(a.len(), b.len()); diff --git a/crypto/math-cuda/tests/ext3_edge.rs b/crypto/math-cuda/tests/ext3_edge.rs new file mode 100644 index 000000000..55b20bc92 --- /dev/null +++ b/crypto/math-cuda/tests/ext3_edge.rs @@ -0,0 +1,176 @@ +//! Adversarial edge-case parity tests for `ext3::mul` on the GPU. +//! +//! The CUDA `dot3` (kernels/ext3.cuh:62-98) manually tracks overflow when +//! summing three u128 products in split u64 hi/lo registers. The CPU +//! reference (`crypto/math/src/field/goldilocks.rs::dot_product_3` via +//! `Degree3GoldilocksExtensionField::mul`) uses native u128 and so reaches +//! the same answer via a totally different code path. These tests pick +//! inputs that maximally stress the overflow-count tracking, the +//! non-canonical input handling, and the identity/zero cases that random +//! tests are unlikely to cover. + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +const P: u64 = 0xFFFF_FFFF_0000_0001; // Goldilocks prime +const EPSILON: u64 = 0xFFFF_FFFF; // 2^32 - 1 + +fn canon(x: u64) -> u64 { + GoldilocksField::canonical(&x) +} + +fn canon3(t: [u64; 3]) -> [u64; 3] { + [canon(t[0]), canon(t[1]), canon(t[2])] +} + +/// Run `pairs` of (a, b) through the GPU `ext3_mul_u64` and compare +/// canonical output to the CPU `Degree3GoldilocksExtensionField::mul`. +/// `label_fn(i)` produces a per-case label printed on failure. +fn assert_ext3_mul_pairs(pairs: &[([u64; 3], [u64; 3])], label_fn: impl Fn(usize) -> String) { + let mut a_raw = Vec::with_capacity(pairs.len() * 3); + let mut b_raw = Vec::with_capacity(pairs.len() * 3); + for (a, b) in pairs { + a_raw.extend_from_slice(a); + b_raw.extend_from_slice(b); + } + let gpu = math_cuda::ext3_mul_u64(&a_raw, &b_raw).expect("GPU ext3 mul launch"); + assert_eq!(gpu.len(), 3 * pairs.len()); + for (i, (a, b)) in pairs.iter().enumerate() { + // CPU reference. Build via from_raw so non-canonical inputs (like + // u64::MAX or p, p+1) are passed in untouched — matching what the + // GPU sees. + let ae = [Fp::from_raw(a[0]), Fp::from_raw(a[1]), Fp::from_raw(a[2])]; + let be = [Fp::from_raw(b[0]), Fp::from_raw(b[1]), Fp::from_raw(b[2])]; + let cpu = Degree3GoldilocksExtensionField::mul(&ae, &be); + let cpu_fp3 = Fp3::new(cpu); + let g = canon3([gpu[3 * i], gpu[3 * i + 1], gpu[3 * i + 2]]); + let c = canon3([ + *cpu_fp3.value()[0].value(), + *cpu_fp3.value()[1].value(), + *cpu_fp3.value()[2].value(), + ]); + assert_eq!( + g, + c, + "ext3 mul mismatch [{}]: a={:?} b={:?} gpu={:?} cpu={:?}", + label_fn(i), + a, + b, + g, + c, + ); + } +} + +#[test] +fn ext3_mul_max_canonical_inputs() { + // (p-1, p-1, p-1) * (p-1, p-1, p-1) — every base limb is (p-1), every + // dot3 product a_i*b_i = (p-1)^2 ~ 2^128, so summing three of them + // forces the overflow path twice on each component. + let m1 = [P - 1, P - 1, P - 1]; + let pairs = vec![(m1, m1)]; + assert_ext3_mul_pairs(&pairs, |_| "(p-1)^3 squared".into()); +} + +#[test] +fn ext3_mul_zero_cases() { + // (0,0,0) * (p-1,p-1,p-1) must be zero; covers the "all-zero a" path + // where every dot3 product is zero and no overflow occurs. + let z = [0u64, 0, 0]; + let m = [P - 1, P - 1, P - 1]; + let pairs = vec![(z, m), (m, z), (z, z)]; + assert_ext3_mul_pairs(&pairs, |i| format!("zero case {i}")); +} + +#[test] +fn ext3_mul_identity() { + // (1, 0, 0) * (a, b, c) == (a, b, c). One-component is multiplicative + // identity in Fp[w]/(w^3 - 2). Use varied b to also exercise small + // non-zero dot3 products. + let id = [1u64, 0, 0]; + let cases: Vec<([u64; 3], [u64; 3])> = vec![ + (id, [0, 0, 0]), + (id, [1, 0, 0]), + (id, [0, 1, 0]), + (id, [0, 0, 1]), + (id, [P - 1, 1, 2]), + (id, [123, 456, 789]), + (id, [P - 1, P - 1, P - 1]), + // Reverse order: (a, b, c) * (1, 0, 0). + ([0, 0, 0], id), + ([1, 0, 0], id), + ([0, 1, 0], id), + ([0, 0, 1], id), + ([P - 1, 1, 2], id), + ([123, 456, 789], id), + ]; + assert_ext3_mul_pairs(&cases, |i| format!("identity case {i}")); +} + +#[test] +fn ext3_mul_non_canonical_zero_p() { + // (p, p, p) is a non-canonical representation of (0, 0, 0). The CPU + // canonicalises before the dot3 in some code paths, the GPU does not. + // Either way the product must canonicalise to zero. + let p = [P, P, P]; + let some = [123u64, 456, 789]; + let pairs = vec![(p, p), (p, some), (some, p)]; + assert_ext3_mul_pairs(&pairs, |i| format!("non-canonical-p case {i}")); +} + +#[test] +fn ext3_mul_u64_max_all_overflow_paths() { + // (u64::MAX, u64::MAX, u64::MAX) for both operands. u64::MAX = p + (2^32 - 2), + // i.e., a non-canonical representation of (2^32 - 2) mod p. Every dot3 + // product is ~2^128 - small, so summing three of them is the hardest + // possible exercise of `over1`, `over2`, and the EPSILON^2 correction + // path in `dot3`. + let m = [u64::MAX, u64::MAX, u64::MAX]; + assert_ext3_mul_pairs(&[(m, m)], |_| "u64::MAX^3 squared".into()); +} + +#[test] +fn ext3_mul_base_edge_pairs_embedded() { + // Embed every base-field edge value as the `a` component of an ext3 + // element (so b = c = 0) and run all NxN pairs through GPU mul. This + // reduces to base-field multiplication on the a-component but + // exercises all the dot3 zero/non-zero combinations. + let edges: Vec = vec![0, 1, P - 1, P, P + 1, u64::MAX, EPSILON]; + let mut pairs = Vec::with_capacity(edges.len() * edges.len()); + for x in &edges { + for y in &edges { + pairs.push(([*x, 0, 0], [*y, 0, 0])); + } + } + assert_ext3_mul_pairs(&pairs, |i| { + let xi = i / edges.len(); + let yi = i % edges.len(); + format!("base-edge a={:#x} b={:#x}", edges[xi], edges[yi]) + }); +} + +#[test] +fn ext3_mul_base_edges_in_b_and_c_slots() { + // Same edge values, but placed in the b and c slots so the cross terms + // (which involve the `b1_2 = 2*y.b`, `b2_2 = 2*y.c` doubling) are also + // exercised at edge inputs. Non-canonical doubling of P-1 etc. is a + // path that random tests rarely hit. + let edges: Vec = vec![0, 1, P - 1, P, P + 1, u64::MAX, EPSILON]; + let mut pairs = Vec::with_capacity(edges.len() * edges.len()); + for x in &edges { + for y in &edges { + // Put edge values in b/c slots, a non-trivial. + pairs.push(([1, *x, *y], [1, *x, *y])); + } + } + assert_ext3_mul_pairs(&pairs, |i| { + let xi = i / edges.len(); + let yi = i % edges.len(); + format!("ext3-bc-edge b={:#x} c={:#x}", edges[xi], edges[yi]) + }); +} diff --git a/crypto/math-cuda/tests/ext3_sub.rs b/crypto/math-cuda/tests/ext3_sub.rs new file mode 100644 index 000000000..3593379d1 --- /dev/null +++ b/crypto/math-cuda/tests/ext3_sub.rs @@ -0,0 +1,109 @@ +//! Parity test for `ext3::sub` (kernels/ext3.cuh:41-45). This device +//! function is part of the public ext3 header but is not invoked by any +//! kernel in the PR — every other ext3 caller uses `mul`, `add`, or +//! `mul_base`. The PR's review test infrastructure adds an +//! `ext3_sub_kernel` so we can call it directly here for parity vs the +//! CPU `Degree3GoldilocksExtensionField::sub`. + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +const N: usize = 10_000; +const P: u64 = 0xFFFF_FFFF_0000_0001; + +fn random_fp3s(seed: u64, count: usize) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..count) + .map(|_| { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) + }) + .collect() +} + +fn to_u64s(col: &[Fp3]) -> Vec { + let mut v = Vec::with_capacity(col.len() * 3); + for e in col { + v.push(*e.value()[0].value()); + v.push(*e.value()[1].value()); + v.push(*e.value()[2].value()); + } + v +} + +fn canon(x: u64) -> u64 { + GoldilocksField::canonical(&x) +} + +#[test] +fn ext3_sub_matches_cpu_random() { + let a = random_fp3s(101, N); + let b = random_fp3s(202, N); + let a_raw = to_u64s(&a); + let b_raw = to_u64s(&b); + let gpu = math_cuda::ext3_sub_u64(&a_raw, &b_raw).expect("GPU ext3 sub launch"); + assert_eq!(gpu.len(), 3 * N); + for i in 0..N { + let cpu = Degree3GoldilocksExtensionField::sub(a[i].value(), b[i].value()); + let cpu_fp3 = Fp3::new(cpu); + let g = [ + canon(gpu[3 * i]), + canon(gpu[3 * i + 1]), + canon(gpu[3 * i + 2]), + ]; + let c = [ + canon(*cpu_fp3.value()[0].value()), + canon(*cpu_fp3.value()[1].value()), + canon(*cpu_fp3.value()[2].value()), + ]; + assert_eq!(g, c, "ext3 sub mismatch at {i}"); + } +} + +#[test] +fn ext3_sub_edge_cases() { + // Underflow cases: a < b on each component, plus non-canonical p + // representations. + let cases: Vec<([u64; 3], [u64; 3])> = vec![ + ([0, 0, 0], [P - 1, P - 1, P - 1]), + ([1, 2, 3], [P - 1, P - 1, P - 1]), + ([P - 1, P - 1, P - 1], [0, 0, 0]), + ([P, P, P], [P, P, P]), // (0,0,0) - (0,0,0) + ([u64::MAX, u64::MAX, u64::MAX], [0, 0, 0]), + ([0, 0, 0], [u64::MAX, u64::MAX, u64::MAX]), + ]; + let mut a_raw = Vec::new(); + let mut b_raw = Vec::new(); + for (a, b) in &cases { + a_raw.extend_from_slice(a); + b_raw.extend_from_slice(b); + } + let gpu = math_cuda::ext3_sub_u64(&a_raw, &b_raw).expect("GPU ext3 sub launch"); + for (i, (a, b)) in cases.iter().enumerate() { + let ae = [Fp::from_raw(a[0]), Fp::from_raw(a[1]), Fp::from_raw(a[2])]; + let be = [Fp::from_raw(b[0]), Fp::from_raw(b[1]), Fp::from_raw(b[2])]; + let cpu = Degree3GoldilocksExtensionField::sub(&ae, &be); + let cpu_fp3 = Fp3::new(cpu); + let g = [ + canon(gpu[3 * i]), + canon(gpu[3 * i + 1]), + canon(gpu[3 * i + 2]), + ]; + let c = [ + canon(*cpu_fp3.value()[0].value()), + canon(*cpu_fp3.value()[1].value()), + canon(*cpu_fp3.value()[2].value()), + ]; + assert_eq!(g, c, "ext3 sub edge mismatch at {i}: a={a:?} b={b:?}"); + } +} diff --git a/crypto/math-cuda/tests/lde_batch_into.rs b/crypto/math-cuda/tests/lde_batch_into.rs new file mode 100644 index 000000000..19411d152 --- /dev/null +++ b/crypto/math-cuda/tests/lde_batch_into.rs @@ -0,0 +1,87 @@ +//! Direct parity test for `coset_lde_batch_base_into` (lde.rs:331), the +//! caller-allocated-buffer variant of `coset_lde_batch_base`. The two should +//! produce bit-identical canonical output for the same inputs; the only +//! difference between them is who owns the output Vec. +//! +//! This is otherwise covered indirectly through `coset_lde_batch_base_into_with_leaf_hash` +//! and similar, but the base `_into` variant ships as public API with no +//! direct test in the original PR. + +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; + +fn coset_weights(n: usize, g: u64) -> Vec { + let inv_n = *Fp::from(n as u64).inv().unwrap().value(); + let mut w = Vec::with_capacity(n); + let mut cur = inv_n; + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &g); + } + w +} + +fn canon(xs: &[u64]) -> Vec { + xs.iter().map(|x| GoldilocksField::canonical(x)).collect() +} + +fn run_pair(log_n: u64, blowup: usize, m: usize, seed: u64) { + let n = 1usize << log_n; + let lde_size = n * blowup; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let columns: Vec> = (0..m) + .map(|_| (0..n).map(|_| rng.r#gen::()).collect()) + .collect(); + + let weights = coset_weights(n, 7); + + let slices: Vec<&[u64]> = columns.iter().map(|c| c.as_slice()).collect(); + + // Reference: the existing batch-allocates-Vec API. Random-input tests + // already cross-validate this against the CPU single-column LDE in + // `lde_batch.rs`, so any divergence from it pinpoints `_into`. + let ref_out = math_cuda::lde::coset_lde_batch_base(&slices, blowup, &weights).unwrap(); + assert_eq!(ref_out.len(), m); + + // Caller-allocated buffers. + let mut owned: Vec> = (0..m).map(|_| vec![0u64; lde_size]).collect(); + { + let mut outs: Vec<&mut [u64]> = owned.iter_mut().map(|v| v.as_mut_slice()).collect(); + math_cuda::lde::coset_lde_batch_base_into(&slices, blowup, &weights, &mut outs) + .expect("into variant"); + } + + for c in 0..m { + assert_eq!( + canon(&owned[c]), + canon(&ref_out[c]), + "_into vs _batch_base diverge at column {c}, log_n={log_n}, blowup={blowup}, m={m}" + ); + } +} + +#[test] +fn into_matches_batch_base_small() { + run_pair(8, 4, 4, 1); + run_pair(10, 4, 1, 2); + run_pair(10, 4, 16, 3); +} + +#[test] +fn into_matches_batch_base_medium() { + run_pair(14, 4, 8, 4); + run_pair(15, 4, 32, 5); +} + +#[test] +fn into_matches_batch_base_uneven_blowup() { + // Non-default blowup factors (still power of two) — confirms the + // _into variant respects blowup_factor identically. + run_pair(8, 2, 4, 6); + run_pair(8, 8, 4, 7); +} diff --git a/crypto/math-cuda/tests/ntt_known.rs b/crypto/math-cuda/tests/ntt_known.rs new file mode 100644 index 000000000..b2cb2a162 --- /dev/null +++ b/crypto/math-cuda/tests/ntt_known.rs @@ -0,0 +1,125 @@ +//! Known-answer NTT test. Random-input tests catch most bugs but can mask +//! systematic errors (sign flips, off-by-one twiddle indices, wrong-direction +//! butterflies) that would cancel under noise. This test picks a polynomial +//! with a known closed-form evaluation at every root of unity and compares +//! the GPU forward NTT to that reference, computed independently from any +//! FFT code path. + +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsFFTField, IsPrimeField}; + +type Fp = FieldElement; + +fn canon(x: u64) -> u64 { + GoldilocksField::canonical(&x) +} + +/// p(x) = 1 + x. NTT at size N is the vector [p(omega^0), p(omega^1), ..., +/// p(omega^(N-1))] for `omega` a primitive N-th root of unity. We compute +/// the reference by direct exponentiation of `omega` — no FFT involved — +/// so a bug in either the CPU or GPU NTT can't hide here. +#[test] +fn ntt_known_polynomial_x_plus_one_size_256() { + let log_n: u64 = 8; + let n: usize = 1 << log_n; + + // GoldilocksField uses bit-reversed coefficient layout for forward NTT: + // input coeffs at index `i` become `p(omega^bitrev(i))` — confirm by + // matching against the existing forward() API which random-input tests + // already validate against `Polynomial::evaluate_fft`. The known-poly + // value lets us catch systematic errors in that pipeline that random + // inputs miss. + + // Input coefficients: [1, 1, 0, 0, ..., 0]. (Natural order, lowest + // degree first — `Polynomial::new` and `math_cuda::ntt::forward` both + // expect this convention.) + let mut input = vec![0u64; n]; + input[0] = 1; + input[1] = 1; + + let gpu = math_cuda::ntt::forward(&input).expect("gpu ntt"); + assert_eq!(gpu.len(), n); + + // Reference: omega = primitive N-th root of unity in Goldilocks. + // p(omega^i) = 1 + omega^i. + let omega = GoldilocksField::get_primitive_root_of_unity(log_n).expect("root of unity"); + let one = Fp::from_raw(1); + + let mut expected = Vec::with_capacity(n); + let mut omega_i = one.clone(); // omega^0 + for _ in 0..n { + let val = &one + &omega_i; + expected.push(*val.value()); + omega_i = &omega_i * ω + } + + for i in 0..n { + let g = canon(gpu[i]); + let e = canon(expected[i]); + if g != e { + panic!( + "p(omega^{i}) mismatch: gpu canon {:#018x}, expected canon {:#018x} (omega^{i} computed independently of any FFT)", + g, e, + ); + } + } +} + +/// Same idea, smaller: p(x) = 1 + x at size 2^4 = 16. A failure at this +/// size with passes at larger sizes (or vice-versa) would point at a +/// boundary bug between the recursive base case and the 8-level +/// shared-memory fused step in the GPU NTT. +#[test] +fn ntt_known_polynomial_x_plus_one_size_16() { + let log_n: u64 = 4; + let n: usize = 1 << log_n; + + let mut input = vec![0u64; n]; + input[0] = 1; + input[1] = 1; + + let gpu = math_cuda::ntt::forward(&input).expect("gpu ntt"); + + let omega = GoldilocksField::get_primitive_root_of_unity(log_n).expect("root"); + let one = Fp::from_raw(1); + let mut omega_i = one.clone(); + for i in 0..n { + let exp = &one + &omega_i; + assert_eq!( + canon(gpu[i]), + canon(*exp.value()), + "p(omega^{i}) mismatch at size 16" + ); + omega_i = &omega_i * ω + } +} + +/// p(x) = x^k for k = N/2. p(omega^i) = omega^(k*i). With k = N/2, +/// omega^(k*i) = (-1)^i since omega^(N/2) = -1 in any field with a +/// primitive N-th root of unity. So evaluations alternate +1, -1, +1, -1. +/// This is a strong test of twiddle-index direction and sign. +#[test] +fn ntt_known_polynomial_x_half_alternating() { + let log_n: u64 = 8; + let n: usize = 1 << log_n; + let k = n / 2; + + let mut input = vec![0u64; n]; + input[k] = 1; // p(x) = x^(N/2) + + let gpu = math_cuda::ntt::forward(&input).expect("gpu ntt"); + + // Expected: omega^(k*i) for i = 0..N, which is (omega^k)^i = (-1)^i. + // canonical(+1) = 1; canonical(-1) = p - 1. + let p_minus_one = 0xFFFF_FFFF_0000_0001u64 - 1; + for i in 0..n { + let exp = if i % 2 == 0 { 1u64 } else { p_minus_one }; + assert_eq!( + canon(gpu[i]), + exp, + "x^(N/2) NTT alternation mismatch at i={i}: got {:#018x}", + canon(gpu[i]) + ); + } +} From 8c529718ee6b8f0ae4287c3cab39b3dd552783dc Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Wed, 13 May 2026 11:59:28 -0300 Subject: [PATCH 7/7] address PR review feedback --- Makefile | 10 +- crypto/math-cuda/Cargo.toml | 3 +- crypto/math-cuda/build.rs | 57 +++++++-- crypto/math-cuda/kernels/ntt.cu | 2 +- crypto/math-cuda/src/device.rs | 29 ++++- crypto/math-cuda/src/lde.rs | 152 +++++++++-------------- crypto/math-cuda/src/lib.rs | 10 +- crypto/math-cuda/src/ntt.rs | 21 +++- crypto/math-cuda/tests/ext3_edge.rs | 2 +- crypto/math-cuda/tests/ext3_sub.rs | 4 +- crypto/math-cuda/tests/lde.rs | 4 +- crypto/math-cuda/tests/lde_batch_into.rs | 4 +- crypto/math-cuda/tests/ntt.rs | 2 - crypto/math-cuda/tests/ntt_known.rs | 26 ++-- 14 files changed, 184 insertions(+), 142 deletions(-) diff --git a/Makefile b/Makefile index c02bffc49..10be4f07a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: deps deps-linux deps-macos prepare-test-data compile-programs-asm compile-programs-rust compile-bench \ compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm test-no-compile \ test-asm-no-compile test-rust test-rust-no-compile test-executor flamegraph-prover \ -test-fast test-prover test-prover-all build check clippy fmt lint +test-fast test-prover test-prover-all test-math-cuda bench-math-cuda build check clippy fmt lint UNAME := $(shell uname) @@ -166,6 +166,14 @@ test-prover-all: test-prover-debug: cargo test -p lambda-vm-prover --features debug-checks -- --nocapture +# math-cuda parity tests (requires NVIDIA GPU + nvcc) +test-math-cuda: + cargo test -p math-cuda --release + +# math-cuda quick microbench (median of 10 runs) +bench-math-cuda: + cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture + # Build all build: cargo build --workspace diff --git a/crypto/math-cuda/Cargo.toml b/crypto/math-cuda/Cargo.toml index 8c22d1110..0990dd6d6 100644 --- a/crypto/math-cuda/Cargo.toml +++ b/crypto/math-cuda/Cargo.toml @@ -9,7 +9,8 @@ cudarc = { version = "0.19", default-features = false, features = [ "driver", "nvrtc", "std", - "cuda-12080", + "cuda-version-from-build-system", + "fallback-latest", "dynamic-loading", ] } math = { path = "../math" } diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs index 43edaa31c..cf541b5fd 100644 --- a/crypto/math-cuda/build.rs +++ b/crypto/math-cuda/build.rs @@ -14,6 +14,38 @@ fn nvcc_path() -> PathBuf { cuda_home().join("bin").join("nvcc") } +/// Query `nvidia-smi` for the local GPU's compute capability (e.g. "12.0" +/// for Blackwell). Returns a `compute_XX` target on success, falling back +/// to `compute_89` (Ada) when no GPU is visible or the query fails. +fn detect_arch() -> String { + const FALLBACK: &str = "compute_89"; + let output = match Command::new("nvidia-smi") + .args(["--query-gpu=compute_cap", "--format=csv,noheader"]) + .output() + { + Ok(o) if o.status.success() => o, + _ => return FALLBACK.to_string(), + }; + let line = match std::str::from_utf8(&output.stdout) { + Ok(s) => s, + Err(_) => return FALLBACK.to_string(), + }; + // First line, first comma-separated value (covers multi-GPU hosts). + let cap = match line.lines().next() { + Some(l) => l.split(',').next().unwrap_or("").trim(), + None => return FALLBACK.to_string(), + }; + let (major, minor) = match cap.split_once('.') { + Some((m, n)) => (m.trim(), n.trim()), + None => return FALLBACK.to_string(), + }; + if major.chars().all(|c| c.is_ascii_digit()) && minor.chars().all(|c| c.is_ascii_digit()) { + format!("compute_{major}{minor}") + } else { + FALLBACK.to_string() + } +} + fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) { let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); @@ -25,20 +57,22 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) { println!("cargo:rerun-if-env-changed=CUDA_PATH"); println!("cargo:rerun-if-env-changed=CUDARC_NVCC_ARCH"); - // No nvcc on PATH → emit an empty PTX stub so the crate still compiles. - // include_str! in src/device.rs needs the file to exist at build time. - // Any runtime kernel call will then panic from cudarc when loading the - // empty module — which is the right failure mode (we can't run GPU code - // without nvcc on the build host anyway). + // When nvcc is missing from PATH, emit an empty PTX stub so the crate + // still compiles. include_str! in src/device.rs needs the file to exist + // at build time. Any runtime kernel call panics in cudarc when loading + // the empty module. We can't run GPU code without nvcc on the build + // host anyway. if !have_nvcc { fs::write(&out_path, "").expect("failed to write empty PTX stub"); return; } // Emit PTX for a virtual architecture; the CUDA driver JIT-compiles it for the - // actual GPU at load time, so one PTX works across Ada/Hopper/Blackwell. Override - // with CUDARC_NVCC_ARCH to pin a specific compute capability. - let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| "compute_89".to_string()); + // actual GPU at load time. Override with CUDARC_NVCC_ARCH to pin a specific + // compute capability. If unset, try `nvidia-smi` to match the host GPU + // (avoids JIT failures like nvcc-13.0 PTX rejected on Blackwell drivers); + // fall back to compute_89 (Ada) when detection fails. + let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch()); let status = Command::new(nvcc_path()) .args(["--ptx", "-O3", "-std=c++17", "-arch", &arch, "-o"]) @@ -53,13 +87,14 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) { } fn main() { - // Headers are not compiled; emit rerun-if-changed so edits trigger rebuilds. + // Headers aren't compiled, so emit rerun-if-changed to rebuild on + // header edits. println!("cargo:rerun-if-changed=kernels/goldilocks.cuh"); println!("cargo:rerun-if-changed=kernels/ext3.cuh"); // Probe for nvcc once. Workspace consumers (clippy, fmt, CPU-only test - // runners) build math-cuda incidentally without using its kernels; allow - // those to succeed by stubbing out PTX when nvcc is unavailable. + // runners) build math-cuda incidentally without using its kernels. Stub + // out PTX when nvcc is unavailable so those builds succeed. let have_nvcc = Command::new(nvcc_path()) .arg("--version") .output() diff --git a/crypto/math-cuda/kernels/ntt.cu b/crypto/math-cuda/kernels/ntt.cu index a8e50d4e1..cf5e1df2c 100644 --- a/crypto/math-cuda/kernels/ntt.cu +++ b/crypto/math-cuda/kernels/ntt.cu @@ -218,7 +218,7 @@ extern "C" __global__ void ntt_dit_level(uint64_t *x, /// without writing to global memory between them — cuts DRAM traffic by up /// to 8× vs the per-level kernel. /// -/// `base_step` selects which 8-level window this launch handles (0, 8, 16…). +/// `base_step` selects which 8-level window this launch handles (0, 8, 16, ...). /// For levels 0–7 the implicit DIT element layout already places all pair /// mates inside the same 256-block; for higher base_step we remap the loaded /// row so pair mates land in consecutive shared-memory slots. diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index af5105ee5..b3c4a1c56 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -158,8 +158,9 @@ impl Backend { // not part of the pool that callers rotate through. let util_stream = ctx.new_stream()?; - // Goldilocks TWO_ADICITY is 32, so log_n ≤ 32 covers every LDE size - // the prover can produce. Overshoot by one for safety. + // Cache is indexed by log_n. Valid range is [0, TWO_ADICITY] since + // Goldilocks has roots of unity for orders 2^0..=2^TWO_ADICITY only. + // Length = TWO_ADICITY + 1 to allow indexing at log_n = TWO_ADICITY. let max_log = GoldilocksField::TWO_ADICITY as usize + 1; Ok(Self { @@ -219,6 +220,14 @@ impl Backend { } else { &self.inv_twiddles }; + // Cache is sized TWO_ADICITY + 1 in `Backend::init`. Callers derive + // log_n from `trailing_zeros` of valid Goldilocks domain sizes so it + // must stay in range; assert in debug to catch regressions. + debug_assert!( + log_n <= GoldilocksField::TWO_ADICITY, + "log_n {log_n} exceeds Goldilocks TWO_ADICITY ({})", + GoldilocksField::TWO_ADICITY, + ); { let guard = cache.lock().unwrap(); if let Some(t) = &guard[idx] { @@ -244,7 +253,19 @@ impl Backend { } } -pub fn backend() -> &'static Backend { +/// Returns the process-wide CUDA backend, initialising it on first call. +/// +/// Returns `Err` when CUDA initialisation fails (no driver, no GPU, PTX load +/// failure). Initialisation is retried on the next call until one succeeds — +/// only a successful `Backend` is cached. The race window where two threads +/// init concurrently is harmless: at most one extra `Backend::init()` runs +/// and the loser is dropped. +pub fn backend() -> Result<&'static Backend> { static BACKEND: OnceLock = OnceLock::new(); - BACKEND.get_or_init(|| Backend::init().expect("failed to initialise CUDA backend")) + if let Some(b) = BACKEND.get() { + return Ok(b); + } + let b = Backend::init()?; + let _ = BACKEND.set(b); + Ok(BACKEND.get().expect("backend just initialised")) } diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 5f1b95f34..719a50931 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -4,7 +4,7 @@ //! Input : N evaluations (natural order) of a poly on the standard subgroup, //! plus coset weights (size N). The weights include the `1/N` iFFT //! normalisation, matching the `LdeTwiddles::coset_weights` format at -//! `crypto/stark/src/prover.rs:248` — i.e. `weights[i] = g^i / N`. +//! `crypto/stark/src/prover.rs` — i.e. `weights[i] = g^i / N`. //! Output : N*blowup_factor evaluations (natural order) on the coset. //! //! On-device steps, picks a stream from the shared pool so rayon-parallel @@ -13,11 +13,24 @@ use std::sync::Arc; use cudarc::driver::{CudaSlice, LaunchConfig, PushKernelArg}; +use rayon::prelude::*; use crate::Result; use crate::device::backend; use crate::ntt::run_ntt_body; +/// Goldilocks `TWO_ADICITY = 32` puts the theoretical domain ceiling at +/// `2^32`, where a downstream `as u32` cast would silently truncate to zero +/// and the corresponding kernel launch would do nothing. Assert at each +/// public entry point before any cast that depends on it. +#[inline] +fn assert_u32_domain(n: usize, what: &str) { + assert!( + n <= u32::MAX as usize, + "{what}: {n} exceeds u32 range — kernel grid would silently truncate", + ); +} + /// Handle to a base-field LDE kept live on device after R1 commit. /// Layout: `m` columns, each `lde_size` u64s, column `c` at byte offset /// `c * lde_size * 8` within `buf`. Freed when `buf` Arc drops. @@ -40,20 +53,23 @@ pub struct GpuLdeExt3 { pub fn coset_lde_base(evals: &[u64], blowup_factor: usize, weights: &[u64]) -> Result> { let n = evals.len(); + // Empty input must short-circuit before the power-of-two assert + // (is_power_of_two returns false for 0). + if n == 0 { + return Ok(Vec::new()); + } assert!(n.is_power_of_two(), "evals length must be a power of two"); assert_eq!(weights.len(), n, "weights length must match evals"); assert!( blowup_factor.is_power_of_two(), "blowup must be power of two" ); - if n == 0 { - return Ok(Vec::new()); - } let lde_size = n * blowup_factor; + assert_u32_domain(lde_size, "coset_lde_base lde_size"); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; - let be = backend(); + let be = backend()?; let stream = be.next_stream(); // Device buffer of lde_size, zero-padded tail, first N filled by copy. @@ -127,6 +143,11 @@ pub fn coset_lde_batch_base( } let m = columns.len(); let n = columns[0].len(); + // Empty columns must short-circuit before the power-of-two assert + // (is_power_of_two returns false for 0). + if n == 0 { + return Ok(vec![Vec::new(); m]); + } assert!(n.is_power_of_two(), "column length must be a power of two"); assert_eq!(weights.len(), n, "weights length must match column length"); assert!( @@ -136,33 +157,15 @@ pub fn coset_lde_batch_base( for c in columns.iter() { assert_eq!(c.len(), n, "all columns must be the same size"); } - - if n == 0 { - return Ok(vec![Vec::new(); m]); - } let lde_size = n * blowup_factor; + assert_u32_domain(lde_size, "coset_lde_batch_base lde_size"); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let staging_slot = be.pinned_staging(); - let debug_phases = std::env::var("MATH_CUDA_PHASE_TIMING").is_ok(); - let t_start = if debug_phases { - Some(std::time::Instant::now()) - } else { - None - }; - let phase = |label: &str, prev: &mut Option| { - if let Some(p) = prev.as_ref() { - let now = std::time::Instant::now(); - eprintln!(" [{:>6.2} ms] {}", (now - *p).as_secs_f64() * 1e3, label); - *prev = Some(now); - } - }; - let mut last = t_start; - // Pinned staging. Lock and grow to max(m*n for upload, m*lde_size for // download). Holding the guard across the whole call serialises concurrent // batched calls that happened to hash to the same stream slot, but that's @@ -171,14 +174,10 @@ pub fn coset_lde_batch_base( staging.ensure_capacity(m * lde_size, &be.ctx)?; // SAFETY: staging is locked, the slice alias ends before we unlock. let pinned = unsafe { staging.as_mut_slice(m * lde_size) }; - if debug_phases { - phase("staging lock + grow", &mut last); - } // Pack columns into first m*n slots of the pinned buffer. Parallel: pinned - // writes are DRAM-bandwidth bound, saturates at ~8 cores on modern - // hardware, so rayon shaves 20+ ms at prover scale. - use rayon::prelude::*; + // writes are DRAM-bandwidth bound, so rayon spreads the cost across CPU + // cores. let pinned_base_ptr = pinned.as_mut_ptr() as usize; columns.par_iter().enumerate().for_each(|(c, col)| { // SAFETY: each task writes to a disjoint `[c*n..c*n+n]` region of @@ -188,35 +187,20 @@ pub fn coset_lde_batch_base( unsafe { std::slice::from_raw_parts_mut((pinned_base_ptr as *mut u64).add(c * n), n) }; dst.copy_from_slice(col); }); - if debug_phases { - phase("host pack (pinned, rayon)", &mut last); - } // Column layout: `buf[c * lde_size + r]`. Zeroed so the [n, lde_size) // tail of each column is already the zero-pad the CPU path does. let mut buf = stream.alloc_zeros::(m * lde_size)?; - if debug_phases { - stream.synchronize()?; - phase("alloc_zeros", &mut last); - } // One memcpy per column from the pinned buffer into the strided slots. // The pinned source hits PCIe line-rate. for c in 0..m { let mut dst = buf.slice_mut(c * lde_size..c * lde_size + n); stream.memcpy_htod(&pinned[c * n..c * n + n], &mut dst)?; } - if debug_phases { - stream.synchronize()?; - phase("H2D cols (pinned)", &mut last); - } let inv_tw = be.inv_twiddles_for(log_n)?; let fwd_tw = be.fwd_twiddles_for(log_lde)?; let weights_dev = stream.clone_htod(weights)?; - if debug_phases { - stream.synchronize()?; - phase("twiddles + weights", &mut last); - } let n_u64 = n as u64; let lde_u64 = lde_size as u64; @@ -242,10 +226,6 @@ pub fn coset_lde_batch_base( } } - if debug_phases { - stream.synchronize()?; - phase("bit_reverse N", &mut last); - } // === 2. iNTT body over all columns === run_batched_ntt_body( stream.as_ref(), @@ -256,10 +236,6 @@ pub fn coset_lde_batch_base( col_stride_u64, m_u32, )?; - if debug_phases { - stream.synchronize()?; - phase("iNTT body", &mut last); - } // === 3. Pointwise multiply by coset weights (includes 1/N) === { @@ -299,10 +275,6 @@ pub fn coset_lde_batch_base( } } - if debug_phases { - stream.synchronize()?; - phase("pointwise + bit_reverse LDE", &mut last); - } // === 5. Forward NTT on full LDE of every column === run_batched_ntt_body( stream.as_ref(), @@ -313,29 +285,22 @@ pub fn coset_lde_batch_base( col_stride_u64, m_u32, )?; - if debug_phases { - stream.synchronize()?; - phase("forward NTT body", &mut last); - } // Single big D2H into the reusable pinned staging buffer — pinned, one // call to the driver, saturates PCIe. stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?; stream.synchronize()?; - if debug_phases { - phase("D2H (one shot into pinned)", &mut last); - } // Split pinned → per-column Vecs. The first write to each virgin - // Vec page-faults, which can dominate total time (~75 ms for 128 MB). - // Parallelise so the fault cost spreads across CPU cores. + // Vec page-faults, which can dominate total time. Parallelise so the + // fault cost spreads across CPU cores. let pinned_ptr = pinned.as_ptr() as usize; let out: Vec> = (0..m) .into_par_iter() .map(|c| { - // set_len skips the O(N) zero-init that vec![0; n] would do - // (saves ~75 ms per 128 MB at prover scale). copy_from_slice - // below writes every slot before any reader sees the Vec. + // set_len skips the O(N) zero-init that vec![0; n] would do. + // copy_from_slice below writes every slot before any reader + // sees the Vec. #[allow(clippy::uninit_vec)] let mut v = { let mut v = Vec::::with_capacity(lde_size); @@ -349,18 +314,15 @@ pub fn coset_lde_batch_base( v }) .collect(); - if debug_phases { - phase("copy out (rayon pinned → Vecs)", &mut last); - } drop(staging); Ok(out) } /// Like `coset_lde_batch_base` but writes directly into caller-provided /// output slices instead of allocating fresh `Vec`s. Each output slice -/// must already have length `n * blowup_factor`. Saves ~50–100 ms of pageable -/// allocator work + page faults at prover scale because the caller's Vecs -/// have been sized once and are reused across calls. +/// must already have length `n * blowup_factor`. Avoids pageable allocator +/// work and page faults at prover scale because the caller's Vecs have been +/// sized once and are reused across calls. pub fn coset_lde_batch_base_into( columns: &[&[u64]], blowup_factor: usize, @@ -373,6 +335,11 @@ pub fn coset_lde_batch_base_into( let m = columns.len(); assert_eq!(outputs.len(), m, "outputs must match columns count"); let n = columns[0].len(); + // Empty columns must short-circuit before the power-of-two assert + // (is_power_of_two returns false for 0). + if n == 0 { + return Ok(()); + } assert!(n.is_power_of_two(), "column length must be a power of two"); assert_eq!(weights.len(), n, "weights length must match column length"); assert!( @@ -386,13 +353,11 @@ pub fn coset_lde_batch_base_into( for o in outputs.iter() { assert_eq!(o.len(), lde_size, "each output must be lde_size"); } - if n == 0 { - return Ok(()); - } + assert_u32_domain(lde_size, "coset_lde_batch_base_into lde_size"); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let staging_slot = be.pinned_staging(); @@ -496,7 +461,6 @@ pub fn coset_lde_batch_base_into( // Parallel copy pinned → caller outputs. Caller's Vecs may still fault // on first write; we spread that cost across rayon cores. #[allow(unused_imports)] - use rayon::prelude::*; let pinned_ptr = pinned.as_ptr() as usize; outputs.par_iter_mut().enumerate().for_each(|(c, dst)| { let src = unsafe { @@ -558,6 +522,11 @@ fn evaluate_poly_coset_batch_ext3_into_inner( } let m = coefs.len(); assert_eq!(outputs.len(), m); + // Empty domain must short-circuit before the power-of-two assert + // (is_power_of_two returns false for 0). + if n == 0 { + return Ok(None); + } assert!(n.is_power_of_two()); assert_eq!(weights.len(), n); assert!(blowup_factor.is_power_of_two()); @@ -568,13 +537,11 @@ fn evaluate_poly_coset_batch_ext3_into_inner( for o in outputs.iter() { assert_eq!(o.len(), 3 * lde_size); } - if n == 0 { - return Ok(None); - } + assert_u32_domain(lde_size, "evaluate_poly_coset_batch_ext3_into lde_size"); let log_lde = lde_size.trailing_zeros() as u64; let mb = 3 * m; - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let staging_slot = be.pinned_staging(); @@ -582,7 +549,6 @@ fn evaluate_poly_coset_batch_ext3_into_inner( staging.ensure_capacity(mb * lde_size, &be.ctx)?; let pinned = unsafe { staging.as_mut_slice(mb * lde_size) }; - use rayon::prelude::*; let pinned_ptr_u = pinned.as_mut_ptr() as usize; coefs.par_iter().enumerate().for_each(|(c, col)| { let slab_a = unsafe { @@ -736,6 +702,11 @@ pub fn coset_lde_batch_ext3_into( } let m = columns.len(); assert_eq!(outputs.len(), m, "outputs must match columns count"); + // Empty domain must short-circuit before the power-of-two assert + // (is_power_of_two returns false for 0). + if n == 0 { + return Ok(()); + } assert!(n.is_power_of_two(), "n must be a power of two"); assert_eq!(weights.len(), n, "weights length must match n"); assert!( @@ -749,16 +720,14 @@ pub fn coset_lde_batch_ext3_into( for o in outputs.iter() { assert_eq!(o.len(), 3 * lde_size, "each output must be 3*lde_size u64s"); } - if n == 0 { - return Ok(()); - } + assert_u32_domain(lde_size, "coset_lde_batch_ext3_into lde_size"); let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; // 3 base slabs per ext3 column; slab index `c*3 + k` holds component `k`. let mb = 3 * m; - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let staging_slot = be.pinned_staging(); @@ -769,7 +738,6 @@ pub fn coset_lde_batch_ext3_into( // Pack: for each ext3 column, write 3 base slabs into pinned. The slab // for column c, component k lives at `pinned[(c*3 + k)*n .. (c*3+k)*n + n]`. // We de-interleave from the interleaved `[a, b, c, a, b, c, ...]` input. - use rayon::prelude::*; let pinned_ptr_u = pinned.as_mut_ptr() as usize; columns.par_iter().enumerate().for_each(|(c, col)| { // SAFETY: disjoint regions per c; staging lock held. @@ -925,7 +893,7 @@ fn run_batched_ntt_body( col_stride: u64, m: u32, ) -> Result<()> { - let be = backend(); + let be = backend()?; let fused = core::cmp::min(log_n, 8); if fused >= 8 { let grid_x = (n / 256) as u32; diff --git a/crypto/math-cuda/src/lib.rs b/crypto/math-cuda/src/lib.rs index 04ec08b1d..7731fe4c3 100644 --- a/crypto/math-cuda/src/lib.rs +++ b/crypto/math-cuda/src/lib.rs @@ -37,7 +37,7 @@ pub fn gl_neg_u64(a: &[u64]) -> Result> { if n == 0 { return Ok(Vec::new()); } - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let a_dev = stream.clone_htod(a)?; @@ -68,7 +68,7 @@ pub fn ext3_mul_u64(a: &[u64], b: &[u64]) -> Result> { if n == 0 { return Ok(Vec::new()); } - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let a_dev = stream.clone_htod(a)?; let b_dev = stream.clone_htod(b)?; @@ -97,7 +97,7 @@ pub fn ext3_sub_u64(a: &[u64], b: &[u64]) -> Result> { if n == 0 { return Ok(Vec::new()); } - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let a_dev = stream.clone_htod(a)?; let b_dev = stream.clone_htod(b)?; @@ -126,7 +126,7 @@ pub fn ext3_add_u64(a: &[u64], b: &[u64]) -> Result> { if n == 0 { return Ok(Vec::new()); } - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let a_dev = stream.clone_htod(a)?; let b_dev = stream.clone_htod(b)?; @@ -156,7 +156,7 @@ where if n == 0 { return Ok(Vec::new()); } - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let a_dev = stream.clone_htod(a)?; diff --git a/crypto/math-cuda/src/ntt.rs b/crypto/math-cuda/src/ntt.rs index 31333c3ae..f5accc3b1 100644 --- a/crypto/math-cuda/src/ntt.rs +++ b/crypto/math-cuda/src/ntt.rs @@ -14,10 +14,13 @@ use math::field::traits::{IsFFTField, IsField}; use crate::Result; use crate::device::backend; -/// Host-side twiddle table: `[ω^0, ω^1, …, ω^{n/2-1}]` where ω is the +/// Host-side twiddle table: `[ω^0, ω^1, ..., ω^{n/2-1}]` where ω is the /// primitive n-th root of unity. Exposed for `device::Backend::cached_twiddles` /// and for direct use in tests / benches. pub fn twiddles_forward(log_n: u64) -> Vec { + // Smallest meaningful NTT is size 2 (log_n = 1); size-1 has nothing to + // twiddle. The shift `1 << (log_n - 1)` underflows for log_n = 0. + assert!(log_n >= 1, "twiddles_forward: log_n must be >= 1"); let omega = *GoldilocksField::get_primitive_root_of_unity(log_n) .expect("primitive root") .value(); @@ -26,6 +29,7 @@ pub fn twiddles_forward(log_n: u64) -> Vec { /// Inverse twiddle table: `[ω^{-i}]` for i in [0, n/2). pub fn twiddles_inverse(log_n: u64) -> Vec { + assert!(log_n >= 1, "twiddles_inverse: log_n must be >= 1"); let omega = GoldilocksField::get_primitive_root_of_unity(log_n).expect("primitive root"); let omega_inv = FieldElement::::inv(&omega).expect("inverse"); powers_of(*omega_inv.value(), 1usize << (log_n - 1)) @@ -56,13 +60,20 @@ pub fn inverse(evals: &[u64]) -> Result> { fn ntt_inplace(input: &[u64], forward: bool) -> Result> { let n = input.len(); - assert!(n.is_power_of_two(), "ntt length must be a power of two"); + // Empty / size-1 has no work to do. `is_power_of_two()` returns false for + // 0, so this branch must come before the assert to avoid panicking on + // empty input. if n <= 1 { return Ok(input.to_vec()); } + assert!(n.is_power_of_two(), "ntt length must be a power of two"); + assert!( + n <= u32::MAX as usize, + "ntt length {n} exceeds u32 range — kernel grid would silently truncate", + ); let log_n = n.trailing_zeros() as u64; - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let mut x_dev = stream.clone_htod(input)?; @@ -117,7 +128,7 @@ pub(crate) fn run_ntt_body( n: u64, log_n: u64, ) -> Result<()> { - let be = backend(); + let be = backend()?; // Levels 0..min(log_n, 8): one shmem-fused launch. Loads are fully // coalesced (base_step=0 → `row = tid`) and 8 butterfly rounds stay on // chip. This is the big DRAM-bandwidth win. @@ -183,7 +194,7 @@ pub fn pointwise_mul(x: &[u64], w: &[u64]) -> Result> { if n == 0 { return Ok(Vec::new()); } - let be = backend(); + let be = backend()?; let stream = be.next_stream(); let mut x_dev = stream.clone_htod(x)?; diff --git a/crypto/math-cuda/tests/ext3_edge.rs b/crypto/math-cuda/tests/ext3_edge.rs index 55b20bc92..f298fe884 100644 --- a/crypto/math-cuda/tests/ext3_edge.rs +++ b/crypto/math-cuda/tests/ext3_edge.rs @@ -1,6 +1,6 @@ //! Adversarial edge-case parity tests for `ext3::mul` on the GPU. //! -//! The CUDA `dot3` (kernels/ext3.cuh:62-98) manually tracks overflow when +//! The CUDA `dot3` in kernels/ext3.cuh manually tracks overflow when //! summing three u128 products in split u64 hi/lo registers. The CPU //! reference (`crypto/math/src/field/goldilocks.rs::dot_product_3` via //! `Degree3GoldilocksExtensionField::mul`) uses native u128 and so reaches diff --git a/crypto/math-cuda/tests/ext3_sub.rs b/crypto/math-cuda/tests/ext3_sub.rs index 3593379d1..1e94682a0 100644 --- a/crypto/math-cuda/tests/ext3_sub.rs +++ b/crypto/math-cuda/tests/ext3_sub.rs @@ -1,4 +1,4 @@ -//! Parity test for `ext3::sub` (kernels/ext3.cuh:41-45). This device +//! Parity test for `ext3::sub` in kernels/ext3.cuh. This device //! function is part of the public ext3 header but is not invoked by any //! kernel in the PR — every other ext3 caller uses `mul`, `add`, or //! `mul_base`. The PR's review test infrastructure adds an @@ -78,7 +78,7 @@ fn ext3_sub_edge_cases() { ([0, 0, 0], [P - 1, P - 1, P - 1]), ([1, 2, 3], [P - 1, P - 1, P - 1]), ([P - 1, P - 1, P - 1], [0, 0, 0]), - ([P, P, P], [P, P, P]), // (0,0,0) - (0,0,0) + ([P, P, P], [P, P, P]), // (0,0,0) - (0,0,0) ([u64::MAX, u64::MAX, u64::MAX], [0, 0, 0]), ([0, 0, 0], [u64::MAX, u64::MAX, u64::MAX]), ]; diff --git a/crypto/math-cuda/tests/lde.rs b/crypto/math-cuda/tests/lde.rs index 33f98f9ae..110997e6e 100644 --- a/crypto/math-cuda/tests/lde.rs +++ b/crypto/math-cuda/tests/lde.rs @@ -12,8 +12,8 @@ use rand_chacha::ChaCha8Rng; type Fp = FieldElement; -/// Build the coset weights `[1/N, g/N, g²/N, …, g^{n-1}/N]` — this is the -/// layout `crypto/stark/src/prover.rs:248` uses, with `1/N` pre-folded into the +/// Build the coset weights `[1/N, g/N, g²/N, ..., g^{n-1}/N]` — this is the +/// layout `crypto/stark/src/prover.rs` uses, with `1/N` pre-folded into the /// first coefficient so the iFFT step does not need a separate scaling pass. fn coset_weights(n: usize, coset_offset: u64) -> Vec { let inv_n_fe = FieldElement::::from(n as u64) diff --git a/crypto/math-cuda/tests/lde_batch_into.rs b/crypto/math-cuda/tests/lde_batch_into.rs index 19411d152..c3d25adbf 100644 --- a/crypto/math-cuda/tests/lde_batch_into.rs +++ b/crypto/math-cuda/tests/lde_batch_into.rs @@ -1,4 +1,4 @@ -//! Direct parity test for `coset_lde_batch_base_into` (lde.rs:331), the +//! Direct parity test for `coset_lde_batch_base_into` (lde.rs), the //! caller-allocated-buffer variant of `coset_lde_batch_base`. The two should //! produce bit-identical canonical output for the same inputs; the only //! difference between them is who owns the output Vec. @@ -27,7 +27,7 @@ fn coset_weights(n: usize, g: u64) -> Vec { } fn canon(xs: &[u64]) -> Vec { - xs.iter().map(|x| GoldilocksField::canonical(x)).collect() + xs.iter().map(GoldilocksField::canonical).collect() } fn run_pair(log_n: u64, blowup: usize, m: usize, seed: u64) { diff --git a/crypto/math-cuda/tests/ntt.rs b/crypto/math-cuda/tests/ntt.rs index 17a556c74..c02892204 100644 --- a/crypto/math-cuda/tests/ntt.rs +++ b/crypto/math-cuda/tests/ntt.rs @@ -61,13 +61,11 @@ fn ntt_sizes_medium() { #[test] fn ntt_size_2_to_20() { - // The hot LDE size. One seed is enough; any mismatch screams loudly. assert_ntt_match(20, 0xDEAD); } #[test] fn ntt_trivial_sizes() { - // Power-of-two below the interesting range — should still pass. assert_ntt_match(1, 1); assert_ntt_match(2, 2); assert_ntt_match(3, 3); diff --git a/crypto/math-cuda/tests/ntt_known.rs b/crypto/math-cuda/tests/ntt_known.rs index b2cb2a162..f0a7b9f5c 100644 --- a/crypto/math-cuda/tests/ntt_known.rs +++ b/crypto/math-cuda/tests/ntt_known.rs @@ -17,7 +17,7 @@ fn canon(x: u64) -> u64 { /// p(x) = 1 + x. NTT at size N is the vector [p(omega^0), p(omega^1), ..., /// p(omega^(N-1))] for `omega` a primitive N-th root of unity. We compute -/// the reference by direct exponentiation of `omega` — no FFT involved — +/// the reference by direct exponentiation of `omega` (no FFT involved) /// so a bug in either the CPU or GPU NTT can't hide here. #[test] fn ntt_known_polynomial_x_plus_one_size_256() { @@ -25,14 +25,14 @@ fn ntt_known_polynomial_x_plus_one_size_256() { let n: usize = 1 << log_n; // GoldilocksField uses bit-reversed coefficient layout for forward NTT: - // input coeffs at index `i` become `p(omega^bitrev(i))` — confirm by + // input coeffs at index `i` become `p(omega^bitrev(i))`. Confirm by // matching against the existing forward() API which random-input tests // already validate against `Polynomial::evaluate_fft`. The known-poly // value lets us catch systematic errors in that pipeline that random // inputs miss. // Input coefficients: [1, 1, 0, 0, ..., 0]. (Natural order, lowest - // degree first — `Polynomial::new` and `math_cuda::ntt::forward` both + // degree first. `Polynomial::new` and `math_cuda::ntt::forward` both // expect this convention.) let mut input = vec![0u64; n]; input[0] = 1; @@ -47,16 +47,16 @@ fn ntt_known_polynomial_x_plus_one_size_256() { let one = Fp::from_raw(1); let mut expected = Vec::with_capacity(n); - let mut omega_i = one.clone(); // omega^0 + let mut omega_i = one; // omega^0 for _ in 0..n { let val = &one + &omega_i; expected.push(*val.value()); omega_i = &omega_i * ω } - for i in 0..n { - let g = canon(gpu[i]); - let e = canon(expected[i]); + for (i, (&g_raw, &e_raw)) in gpu.iter().zip(expected.iter()).enumerate() { + let g = canon(g_raw); + let e = canon(e_raw); if g != e { panic!( "p(omega^{i}) mismatch: gpu canon {:#018x}, expected canon {:#018x} (omega^{i} computed independently of any FFT)", @@ -83,11 +83,11 @@ fn ntt_known_polynomial_x_plus_one_size_16() { let omega = GoldilocksField::get_primitive_root_of_unity(log_n).expect("root"); let one = Fp::from_raw(1); - let mut omega_i = one.clone(); - for i in 0..n { + let mut omega_i = one; + for (i, &g) in gpu.iter().enumerate() { let exp = &one + &omega_i; assert_eq!( - canon(gpu[i]), + canon(g), canon(*exp.value()), "p(omega^{i}) mismatch at size 16" ); @@ -113,13 +113,13 @@ fn ntt_known_polynomial_x_half_alternating() { // Expected: omega^(k*i) for i = 0..N, which is (omega^k)^i = (-1)^i. // canonical(+1) = 1; canonical(-1) = p - 1. let p_minus_one = 0xFFFF_FFFF_0000_0001u64 - 1; - for i in 0..n { + for (i, &g) in gpu.iter().enumerate() { let exp = if i % 2 == 0 { 1u64 } else { p_minus_one }; assert_eq!( - canon(gpu[i]), + canon(g), exp, "x^(N/2) NTT alternation mismatch at i={i}: got {:#018x}", - canon(gpu[i]) + canon(g) ); } }