yetanotherco · MauroToscano · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,8 @@ executor/program_artifacts/
 # Shared cargo target directory for ELF builds
 executor/shared_target/
 
+
+# Experiment artifacts — never commit
+artifacts/
+profiles/
+*.bundle
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
   "crypto/stark",
   "crypto/crypto",
   "crypto/math",
+  "crypto/math-cuda",
   "bin/cli",
 ]
 

diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: deps deps-linux deps-macos prepare-test-data compile-programs-asm compile-programs-rust compile-bench \
 compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm test-no-compile \
 test-asm-no-compile test-rust test-rust-no-compile test-executor flamegraph-prover \
-test-fast test-prover test-prover-all build check clippy fmt lint
+test-fast test-prover test-prover-all build check clippy fmt lint test-cuda check-cuda test-fast-cuda
 
 UNAME := $(shell uname)
 
@@ -193,3 +193,17 @@ lint:
 
 flamegraph-prover:
 	cd crypto/stark && samply record cargo bench --bench profile_prover --features parallel
+
+# === CUDA ===
+# Run math-cuda tests (requires CUDA + a visible GPU).
+test-cuda:
+	cargo test -p math-cuda
+
+# Type-check math-cuda on its own, then stark and lambda-vm-prover with the
+# `cuda` feature enabled. Uses `cargo check`, so no tests are run.
+check-cuda:
+	cargo check -p math-cuda
+	cargo check -p stark --features cuda
+	cargo check -p lambda-vm-prover --features cuda
+
+# Fast test suite with GPU LDE enabled (drop-in replacement for `test-fast`).
+test-fast-cuda:
+	cargo test -p lambda-vm-prover -p stark -p executor -F stark/parallel,stark/cuda
diff --git a/README.md b/README.md
@@ -177,6 +177,28 @@ cargo test --release -p lambda-vm-prover --features debug-checks -- --nocapture
 
 The feature is defined in `crypto/stark/Cargo.toml` and forwarded through `prover/Cargo.toml`. It has zero overhead when disabled.
 
+## GPU acceleration (experimental)
+
+A CUDA backend for the per-column coset LDE (the `coset_lde_full_expand` hot path) lives in the `math-cuda` crate and is gated behind the `cuda` feature on `stark` / `lambda-vm-prover`. Requires CUDA 13.x with a visible NVIDIA GPU. Covers the Goldilocks base field only; extension-field columns and small LDEs transparently fall back to the CPU path.
+
+```sh
+# Unit tests for the GPU kernels (parity against CPU, sizes up to 2^20):
+make test-cuda
+
+# Type-check `math-cuda`, plus `stark` and `lambda-vm-prover` with the `cuda` feature:
+make check-cuda
+
+# `test-fast` with GPU LDE enabled:
+make test-fast-cuda
+```
+
+Behaviour:
+- The GPU path fires only when `buffer.len() * blowup_factor >= 2^19` and the column is `FieldElement<GoldilocksField>`. Tune with `LAMBDA_VM_GPU_LDE_THRESHOLD=<n>` at runtime.
+- If the `cuda` feature is enabled and CUDA initialisation fails, the process panics with a clear message — there is no transparent fallback to CPU.
+- The CPU-only build (default) is bit-for-bit identical to before; the feature is zero overhead when disabled.
+
+Status: on a single RTX 5090 with ~46 CPU cores and the current kernel set, end-to-end prove time ties the rayon-parallel CPU path on 1M–4M-instruction proofs. Wins on single-column LDE are ~16× at 2^18 sizes but are swallowed by CPU parallelism and per-call kernel launch overhead. Next steps for a real speedup are kernel fusion across NTT levels, CUDA graphs to amortise launch, keeping LDE on device through Merkle, and moving Keccak/constraint evaluation to GPU.
+
 ## Roadmap for the virtual machine
 
 This project is under active development. Our primary objective is to have a first working version for the virtual machine. Priorities and features might change as we continue developing.

diff --git a/bin/cli/Cargo.toml b/bin/cli/Cargo.toml
@@ -15,3 +15,4 @@ tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true }
 [features]
 jemalloc-stats = ["dep:tikv-jemalloc-ctl"]
 instruments = ["prover/instruments", "stark/instruments"]
+cuda = ["prover/cuda"]
diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs
@@ -54,6 +54,30 @@ where
         Self::build_from_hashed_leaves(hashed_leaves)
     }
 
+    /// Wrap an already-populated node vector in a `MerkleTree` without
+    /// hashing anything.
+    ///
+    /// The vector must use the layout that
+    /// [`Self::build_from_hashed_leaves`] produces:
+    ///
+    ///   - `nodes.len() == 2 * leaves_len - 1`, with `leaves_len` a power of two
+    ///   - `nodes[0]` is the root
+    ///   - `nodes[leaves_len - 1 .. 2*leaves_len - 1]` are the leaves
+    ///
+    /// Intended for trees assembled elsewhere (e.g. on a GPU) that only need
+    /// to be handed to the stark prover. Returns `None` when `nodes` is empty
+    /// or when `nodes.len() + 1` is not a power of two; the node contents
+    /// themselves are not checked.
+    pub fn from_precomputed_nodes(nodes: Vec<B::Node>) -> Option<Self> {
+        // `get` also rejects the empty vector, which the length test below
+        // would otherwise accept (0 + 1 is a power of two).
+        let root = nodes.get(ROOT)?;
+        // 2 * leaves_len - 1 entries  <=>  len + 1 is a power of two.
+        let well_formed = (nodes.len() + 1).is_power_of_two();
+        let root = well_formed.then(|| root.clone())?;
+        Some(Self { root, nodes })
+    }
+
     /// Create a Merkle tree from pre-hashed leaf nodes.
     ///
     /// This skips the `hash_leaves` step, useful when leaves have already been

diff --git a/crypto/math-cuda/Cargo.toml b/crypto/math-cuda/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "math-cuda"
+description = "CUDA-accelerated FFT/NTT for Goldilocks (base field) used by the lambda-vm STARK prover"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cudarc = { version = "0.19", default-features = false, features = [
+    "driver",
+    "nvrtc",
+    "std",
+    "cuda-12080",
+    "dynamic-loading",
+] }
+math = { path = "../math" }
+rayon = "1.7"
+
+[dev-dependencies]
+rand = { version = "0.8.5", features = ["std"] }
+rand_chacha = "0.3.1"
+rayon = "1.7"
+sha3 = "0.10.8"
diff --git a/crypto/math-cuda/DESIGN_EXP11.md b/crypto/math-cuda/DESIGN_EXP11.md
@@ -0,0 +1,149 @@
+# Design: device-resident main trace (exp-11)
+
+Tracking the biggest remaining single win — eliminate redundant
+main-trace host→device copies. Not yet implemented; this doc scopes
+the work. Matches the pattern of exp-4 (tier-3 analysis), which
+shipped as a checkpoint so the plan was preserved across context
+windows.
+
+## Current state
+
+Fib_1M wall-time breakdown at exp-9 tip (15-trial mean 10.96 s):
+
+```
+Trace build             2.48 s   21.9%    CPU (user-supervised)
+Round 1 Phase A         ~1.5 s   13%      Main commits (LDE+Merkle on GPU)
+Round 1 Phase B         ~0 s              LogUp challenges
+Round 1 Pass 1          ~2.0 s   18%      Aux-trace build (LogUp GPU, exp-9)
+Round 1 Pass 2          ~1.0 s    9%      Aux commits (ext3 LDE+Merkle)
+Rounds 2–4              ~4.0 s   36%
+```
+
+Two places currently H2D the same main-trace data per proof:
+
+1. **Phase A** — `coset_lde_batch_base_into_with_merkle_tree_inner`
+   copies each column (total ~240 MB/table) from pinned staging into
+   a device buffer of size `m * lde_size`, then overwrites in place
+   with the iNTT result. The pre-LDE main trace is on device for
+   a few microseconds before the iNTT kernel starts.
+
+2. **Pass 1** — `logup_gpu::try_compute_table_term_columns` calls
+   `upload_main_cols` which does the exact same H2D again. Total
+   wall cost per-table is ~20–40 ms on a 32 GB/s PCIe link; exp-9
+   serializes them so the total is ~200–300 ms wall on fib_1M.
+
+Both uploads carry identical bytes (the table's main columns); the
+second one is pure waste.
+
+## The fix in two steps
+
+### Step 1 — preserve pre-LDE columns in the fused LDE kernel
+
+Modify `coset_lde_batch_base_into_with_merkle_tree_inner` to
+optionally preserve the uploaded trace before iNTT. In the current
+code, after line 769 (`memcpy_htod` loop) the first `n` u64s of each
+column-slab hold the trace. A device-to-device copy to a fresh
+`m*n` buffer just before the iNTT kernel is basically free (VRAM
+bandwidth ≈ 1 TB/s; 240 MB copy takes <0.3 ms).
+
+Signature sketch:
+
+```rust
+pub fn coset_lde_batch_base_into_with_merkle_tree_keep_main(
+    columns: &[&[u64]],
+    blowup_factor: usize,
+    weights: &[u64],
+    outputs: &mut [&mut [u64]],
+    merkle_nodes_out: &mut [u8],
+) -> Result<(GpuLdeBase, Arc<logup::DeviceMainCols>)>
+```
+
+The returned `DeviceMainCols` owns a `CudaSlice<u64>` sized `m * n`
+in column-major order — directly what
+`logup::logup_pair_term_column_on_device` already expects.
+
+### Step 2 — thread the handle to aux-build
+
+`MainTraceCommitResult` already holds an optional `GpuLdeBase`
+(`gpu_main` field, line 172 of prover.rs). Add a sibling
+`gpu_main_pre_lde: Option<Arc<DeviceMainCols>>`. Prover's `multi_prove`
+already stashes the main LDE handle per-table; reuse the same
+lookup pattern for `gpu_main_pre_lde`.
+
+Aux-build currently receives `&mut TraceTable` + `&[challenges]`. To
+reach the per-table handle without changing trait signatures, add a
+module-level `RwLock<HashMap<usize, Arc<DeviceMainCols>>>` in
+`logup_gpu.rs` keyed by `trace as *const _ as usize`. Prover
+populates after Phase A completes; aux-build consults; prover
+clears after Pass 1.
+
+```rust
+// in logup_gpu.rs
+static PRE_LDE_CACHE: LazyLock<RwLock<HashMap<usize, Arc<DeviceMainCols>>>> =
+    LazyLock::new(|| RwLock::new(HashMap::new()));
+
+pub fn store_pre_lde_main(trace_ptr: usize, handle: Arc<DeviceMainCols>);
+pub fn take_pre_lde_main(trace_ptr: usize) -> Option<Arc<DeviceMainCols>>;
+pub fn clear_pre_lde_cache();
+```
+
+Inside `try_compute_table_term_columns`, skip `upload_main_cols` if
+the cache has a handle for this trace pointer; drop back to the
+existing H2D path otherwise (keeps the function correct for tables
+that went through the non-GPU Phase A path).
+
+## Expected win
+
+- Per-table H2D saved: ~20–40 ms
+- Total saved on fib_1M (12 tables × exp-9 serialized): 200–300 ms wall
+- Aux-trace-build wall is currently ~2.0 s, so this lands it at ~1.7 s
+- Total fib_1M projected: ~10.6 s (vs 10.96 s today)
+
+At larger sizes the gain scales:
+- fib_4M: estimated 600–800 ms saved (same number of tables but more
+  rows, so each H2D is bigger and takes longer absolutely)
+
+## Risks / gotchas
+
+- **CudaSlice Send/Sync.** `DeviceMainCols` must be `Send + Sync` to
+  live in an `Arc` across rayon threads. cudarc 0.19 documents
+  `CudaSlice<T>: Send + Sync where T: Send + Sync`, so u64 works.
+  Verify at the compile-error level, don't trust docs.
+- **Cache key stability.** `trace as *const _ as usize` only works
+  while the `TraceTable` isn't moved. In Pass 1 the trace is behind
+  `&mut` and never reallocates, so the key is stable — but if anyone
+  later refactors the aux-build loop to move traces, the cache will
+  silently miss or (worse) hit a stale entry. Add a debug-assert in
+  `try_compute_table_term_columns` that the trace length matches the
+  `n` stored with the cached handle.
+- **Cache lifetime.** The cache must be cleared at the start of each
+  prove so stale handles don't leak into the next proof. Simplest
+  location: `multi_prove` preamble. Alternative: a drop guard tied
+  to the outermost prover scope.
+- **Phase-A CPU fallback.** When Phase A falls back to the CPU LDE
+  path (trace below the GPU threshold), no handle is produced and
+  aux-build correctly falls back to its existing H2D path. No
+  special-casing required.
+- **Memory pressure on 32 GB VRAM.** Each pre-LDE buffer is
+  `num_cols * n * 8` bytes. For fib_4M's biggest table (MEMW_R ×
+  3.1M rows × ~30 cols = 750 MB) multiplied by 3 MEMW_R instances =
+  2.25 GB. Plus LDE buffers (4× larger), that's ~11 GB — still fits
+  comfortably on an RTX 5090. If future work increases table count,
+  consider a drop-when-aux-build-finishes policy rather than
+  holding through Round 4.
+
+## Why this ships as a design, not code
+
+The plumbing touches:
+- `crypto/math-cuda/src/lde.rs` (new fused-path variant)
+- `crypto/math-cuda/src/logup.rs` (cache accessors)
+- `crypto/stark/src/gpu_lde.rs` (wire through the keep variant)
+- `crypto/stark/src/prover.rs` (populate cache, clear at prove start)
+- `crypto/stark/src/logup_gpu.rs` (consult cache, fall back)
+
+~600–900 lines. Doable in a focused day, but not within the time
+budget of the current session. Checkpointing the plan so the next
+pass can execute cleanly.
+
+Estimated effort: one focused work session plus a parity + bench
+run. Expected landing: fib_1M ~10.96 s → ~10.6 s; fib_4M ~32 s → ~31 s (600–800 ms saved).