From cca0a2a2aec2df23aff1016828d5f0bece39e7ad Mon Sep 17 00:00:00 2001 From: Alexis Duburcq Date: Tue, 21 Apr 2026 00:07:43 +0200 Subject: [PATCH] [AutoDiff] Surface LLVM adstack push/pop overflow as a Python exception --- quadrants/codegen/llvm/codegen_llvm.cpp | 2 +- quadrants/program/program.cpp | 5 + quadrants/program/program_impl.h | 6 + .../runtime/llvm/llvm_runtime_executor.cpp | 19 +++ .../runtime/llvm/llvm_runtime_executor.h | 11 ++ .../llvm/runtime_module/internal_functions.h | 51 +++++- .../runtime/llvm/runtime_module/runtime.cpp | 43 ++++- .../runtime/program_impls/llvm/llvm_program.h | 19 +++ tests/python/test_adstack.py | 158 ++++++++++++++++++ 9 files changed, 304 insertions(+), 10 deletions(-) diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp index 1905f33531..35156d8b6a 100644 --- a/quadrants/codegen/llvm/codegen_llvm.cpp +++ b/quadrants/codegen/llvm/codegen_llvm.cpp @@ -2117,7 +2117,7 @@ void TaskCodeGenLLVM::visit(AdStackPopStmt *stmt) { void TaskCodeGenLLVM::visit(AdStackPushStmt *stmt) { auto stack = stmt->stack->as<AdStackAllocaStmt>(); - call("stack_push", llvm_val[stack], tlctx->get_constant(stack->max_size), + call("stack_push", get_runtime(), llvm_val[stack], tlctx->get_constant(stack->max_size), tlctx->get_constant(stack->element_size_in_bytes())); auto primal_ptr = call("stack_top_primal", llvm_val[stack], tlctx->get_constant(stack->element_size_in_bytes())); primal_ptr = builder->CreateBitCast(primal_ptr, llvm::PointerType::get(tlctx->get_data_type(stmt->ret_type), 0)); diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 64c8a08067..06ec1a1d8e 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -307,6 +307,11 @@ void Program::finalize() { return; } + // Notify the backend that teardown has started before the two teardown syncs below. 
On LLVM this flips + // `LlvmProgramImpl::finalizing_` so `check_adstack_overflow()` short-circuits: otherwise a pending overflow + // flag from a kernel the user never synced explicitly would throw into the Program destructor path. + program_impl_->pre_finalize(); + synchronize(); QD_TRACE("Program finalizing..."); diff --git a/quadrants/program/program_impl.h b/quadrants/program/program_impl.h index 8a43c62eb7..bafbcfdc1d 100644 --- a/quadrants/program/program_impl.h +++ b/quadrants/program/program_impl.h @@ -128,6 +128,12 @@ class ProgramImpl { virtual void finalize() { } + // Hook invoked by `Program::finalize()` before any teardown sync. Lets backends flip state (e.g. the LLVM + // `finalizing_` flag used to suppress adstack-overflow polling) so the two `Program::synchronize()` calls that + // precede `finalize()` do not throw into the Program destructor path. + virtual void pre_finalize() { + } + virtual uint64 fetch_result_uint64(int i, uint64 *result_buffer) { return result_buffer[i]; } diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 3ce56b0b78..f794db4a54 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -238,6 +238,24 @@ std::size_t LlvmRuntimeExecutor::get_snode_num_dynamically_allocated(SNode *snod return (std::size_t)runtime_query("ListManager_get_num_elements", result_buffer, data_list); } +void LlvmRuntimeExecutor::check_adstack_overflow() { + // Called from `synchronize()` on every sync so adstack overflow surfaces as a Python exception regardless of + // `compile_config.debug`. The runtime / result buffer may not exist yet (e.g. a C++ test that constructs Program + // without materializing the runtime and then triggers Program::finalize -> synchronize), so no-op in that case. 
+ if (llvm_runtime_ == nullptr || result_buffer_cache_ == nullptr) { + return; + } + auto *runtime_jit_module = get_runtime_jit_module(); + runtime_jit_module->call("runtime_retrieve_and_reset_adstack_overflow", llvm_runtime_); + auto flag = fetch_result<int64>(quadrants_result_buffer_error_id, result_buffer_cache_); + if (flag != 0) { + throw QuadrantsAssertionError( + "Adstack overflow: a reverse-mode autodiff kernel pushed more elements than the adstack capacity " + "allows. Raised at the next qd.sync() rather than at the offending kernel launch. Pass " + "ad_stack_size=N to qd.init() to raise the capacity."); + } +} + void LlvmRuntimeExecutor::check_runtime_error(uint64 *result_buffer) { synchronize(); auto *runtime_jit_module = get_runtime_jit_module(); @@ -617,6 +635,7 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, uint QD_TRACE("LLVMRuntime initialized (excluding `root`)"); llvm_runtime_ = fetch_result<void *>(quadrants_result_buffer_ret_value_id, *result_buffer_ptr); + result_buffer_cache_ = *result_buffer_ptr; QD_TRACE("LLVMRuntime pointer fetched"); // Preallocate for runtime memory and update to LLVMRuntime diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.h b/quadrants/runtime/llvm/llvm_runtime_executor.h index d0732434e3..9ace49b1fd 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.h +++ b/quadrants/runtime/llvm/llvm_runtime_executor.h @@ -53,6 +53,12 @@ class LlvmRuntimeExecutor { void check_runtime_error(uint64 *result_buffer); + // Poll the runtime's adstack-overflow flag and raise if set. Unlike check_runtime_error, this runs + // unconditionally at every synchronize() (not gated on `compile_config.debug`) because adstack overflow silently + // corrupts gradients and we do not want to hide it. Safe to call before materialize_runtime() -- no-op when the + // cached result buffer is not yet populated. 
+ void check_adstack_overflow(); + uint64_t *get_device_alloc_info_ptr(const DeviceAllocation &alloc); const CompileConfig &get_config() const { @@ -132,6 +138,11 @@ class LlvmRuntimeExecutor { std::unique_ptr<JITSession> jit_session_{nullptr}; JITModule *runtime_jit_module_{nullptr}; void *llvm_runtime_{nullptr}; + // Non-owning cache of the Program-owned result buffer so internal polls (adstack overflow, etc.) can be + // invoked from `synchronize()` without threading the pointer through the public API. Ownership stays with + // `Program` for its lifetime; reallocating or repointing `Program::result_buffer` mid-run would invalidate + // this cache, so avoid that. + uint64 *result_buffer_cache_{nullptr}; std::unique_ptr<ThreadPool> thread_pool_{nullptr}; std::shared_ptr<Device> device_{nullptr};
+ QD_TEST_CHECK(stack_top_primal(stack, 4) == stack + sizeof(u64) + 3 * 2 * 4, runtime); + + stack_pop(stack); + stack_pop(stack); + stack_pop(stack); + stack_pop(stack); + QD_TEST_CHECK(*(u64 *)stack == 0, runtime); + + // stack_pop underflow guard: extra pops past n == 0 must not wrap `n` into UINT_MAX. The runtime silently + // clamps at 0 instead of trapping, so the reverse pass can over-pop without corrupting subsequent kernels. + stack_pop(stack); + stack_pop(stack); + QD_TEST_CHECK(*(u64 *)stack == 0, runtime); + + // stack_top_primal clamping: on an empty stack the top-of-stack pointer must index slot 0 (not `-1` + // * 2 * element_size, which would point into header territory and crash on read). + QD_TEST_CHECK(stack_top_primal(stack, 4) == stack + sizeof(u64), runtime); + + // Push past capacity: `n` stops at max_num_elements and `adstack_overflow_flag` flips to 1. + for (int i = 0; i < 16; i++) { + stack_push(runtime, stack, 16, 4); + } + QD_TEST_CHECK(*(u64 *)stack == 16, runtime); + QD_TEST_CHECK(runtime->adstack_overflow_flag == 0, runtime); + stack_push(runtime, stack, 16, 4); // overflow push + QD_TEST_CHECK(*(u64 *)stack == 16, runtime); + QD_TEST_CHECK(runtime->adstack_overflow_flag == 1, runtime); + // Reset the flag so subsequent tests in the same fixture are not poisoned. + runtime->adstack_overflow_flag = 0; + + delete[] stack; return 0; } diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index 7459f17f8c..41ee960dd4 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -577,6 +577,10 @@ struct LLVMRuntime { uint64 error_message_arguments[quadrants_error_message_max_num_arguments]; i32 error_message_lock = 0; i64 error_code = 0; + // Dedicated flag for adstack-overflow-specific errors. 
Separate from `error_code` so assertions (which set + // error_code=1 and are only surfaced when `compile_config.debug` is on) do not leak through the always-on poll + // that Program::synchronize runs. + i64 adstack_overflow_flag = 0; Ptr result_buffer; i32 allocator_lock; @@ -709,6 +713,14 @@ void runtime_retrieve_and_reset_error_code(LLVMRuntime *runtime) { runtime->error_code = 0; } +void runtime_retrieve_and_reset_adstack_overflow(LLVMRuntime *runtime) { + // Paired with the relaxed atomic write in `stack_push`. The host calls this only after the thread pool has + // joined, so strictly no synchronization is required here, but use `__atomic_exchange_n` anyway to keep the + // read/reset symmetric with the write and to avoid annotating the single shared field as half-atomic. + i64 flag = __atomic_exchange_n(&runtime->adstack_overflow_flag, (i64)0, __ATOMIC_RELAXED); + runtime->set_result(quadrants_result_buffer_error_id, flag); +} + void runtime_retrieve_error_message(LLVMRuntime *runtime, int i) { runtime->set_result(quadrants_result_buffer_error_id, runtime->error_message_template[i]); } @@ -1953,9 +1965,17 @@ void quadrants_printf(LLVMRuntime *runtime, const char *format, Args &&...args) extern "C" { // local stack operations +// The stack index `n` is clamped on read so that overflow (push past capacity) does not let subsequent pops and +// top-accesses underflow it and index far out of bounds. The corresponding stack_push sets +// `runtime->adstack_overflow_flag` and skips the increment instead of trapping, so the host-side launcher +// surfaces the failure as a Python exception rather than killing the process via __builtin_trap. When n == 0 +// (pop-after-overflow underflow path) we return a pointer to slot 0 - an uninitialized-but-in-bounds slot. The +// caller will read garbage from it, but the host raises on `runtime->adstack_overflow_flag` before any such +// value reaches user code. 
Ptr stack_top_primal(Ptr stack, std::size_t element_size) { auto n = *(u64 *)stack; - return stack + sizeof(u64) + (n - 1) * 2 * element_size; + std::size_t idx = n > 0 ? n - 1 : 0; + return stack + sizeof(u64) + idx * 2 * element_size; } Ptr stack_top_adjoint(Ptr stack, std::size_t element_size) { @@ -1968,13 +1988,28 @@ void stack_init(Ptr stack) { void stack_pop(Ptr stack) { auto &n = *(u64 *)stack; - n--; + if (n > 0) { + n--; + } } -void stack_push(Ptr stack, size_t max_num_elements, std::size_t element_size) { +void stack_push(LLVMRuntime *runtime, Ptr stack, size_t max_num_elements, std::size_t element_size) { u64 &n = *(u64 *)stack; + if (n + 1 > max_num_elements) { + // Overflow: the loop has more iterations than the adstack capacity. Skip the push and flip the dedicated + // overflow flag so the host launcher throws at sync. Multiple CPU threads can hit this branch concurrently + // (thread pool dispatch over a multi-element field), so write the sentinel through `__atomic_store_n` with + // relaxed ordering: on x86-64/ARM64 this compiles to a regular naturally-aligned store, but it satisfies the + // C++11 memory model (plain non-atomic writes from multiple threads to the same object are a data race, even + // when every writer stores the same value). The host only reads the flag from `check_adstack_overflow()` + // after the thread pool has joined, so no ordering beyond "happens eventually" is required. + // `locked_task` was avoided because the AMDGPU JIT cannot retarget its host-side machinery + // (`hipErrorNoBinaryForGpu`). Using a separate field (not `error_code`) keeps this check distinct from + // assertion machinery, which is debug-gated. 
+ __atomic_store_n(&runtime->adstack_overflow_flag, (i64)1, __ATOMIC_RELAXED); + return; + } n += 1; - // TODO: assert n <= max_elements std::memset(stack_top_primal(stack, element_size), 0, element_size * 2); } diff --git a/quadrants/runtime/program_impls/llvm/llvm_program.h b/quadrants/runtime/program_impls/llvm/llvm_program.h index 25f7722856..a7ff25fed2 100644 --- a/quadrants/runtime/program_impls/llvm/llvm_program.h +++ b/quadrants/runtime/program_impls/llvm/llvm_program.h @@ -99,7 +99,17 @@ class LlvmProgramImpl : public ProgramImpl { return runtime_exec_->fetch_result(i, result_buffer); } + // Skip the adstack-overflow poll from this point on: `Program::finalize()` invokes `pre_finalize()` before the + // two teardown `synchronize()` calls, and we do not want `check_adstack_overflow()` to throw into a + // `~Program()` unwinding path - that would terminate the process with a bare `QuadrantsAssertionError` instead + // of letting the user handle it at their own `qd.sync()` site. The flag only affects the internal poll; the + // user can still call `qd.sync()` explicitly before finalize to observe the raise. + void pre_finalize() override { + finalizing_ = true; + } + void finalize() override { + finalizing_ = true; runtime_exec_->finalize(); } @@ -150,6 +160,9 @@ class LlvmProgramImpl : public ProgramImpl { void synchronize() override { runtime_exec_->synchronize(); + if (!finalizing_) { + runtime_exec_->check_adstack_overflow(); + } } LLVMRuntime *get_llvm_runtime() { @@ -250,6 +263,12 @@ class LlvmProgramImpl : public ProgramImpl { std::size_t num_snode_trees_processed_{0}; std::unique_ptr runtime_exec_; std::unique_ptr cache_data_; + // Flipped on by `pre_finalize()` (with a defensive re-assignment in `finalize()`) so the `synchronize()` + // override stops polling the adstack-overflow flag during teardown. 
`Program::finalize()` invokes + // `pre_finalize()` before its two teardown syncs, so the flag is already true when those syncs run; moving + // the assignment back into `finalize()` alone would silently re-introduce the `std::terminate()` teardown bug + // this field was introduced to fix. + bool finalizing_{false}; }; LlvmProgramImpl *get_llvm_program(Program *prog); diff --git a/tests/python/test_adstack.py b/tests/python/test_adstack.py index 13c56db3df..fa0a8f193e 100644 --- a/tests/python/test_adstack.py +++ b/tests/python/test_adstack.py @@ -1,10 +1,14 @@ import math import pathlib import re +import subprocess +import sys +import textwrap import pytest import quadrants as qd +from quadrants.lang.misc import is_extension_supported from tests import test_utils @@ -427,6 +431,160 @@ def compute(): compute.grad() +def _overflowing_compute(n_elements=1, n_iter=64): + # Shared kernel for the overflow tests. Builds `compute`, loads inputs, seeds the output gradient, and returns + # `(compute, x, y)` so each test can drive the grad launch and read back assertions itself. `n_iter=64` + 2 + # adstack preamble pushes = 66 pushes, comfortably above `default_ad_stack_size=32`; `n_elements` controls how + # many threads run the overflowing loop in parallel. 
+ x = qd.field(qd.f32) + y = qd.field(qd.f32) + qd.root.dense(qd.i, n_elements).place(x, x.grad) + qd.root.place(y, y.grad) + + @qd.kernel + def compute(): + for i in x: + v = x[i] + for _ in range(n_iter): + y[None] += qd.sin(v) + v = v + 1.0 + + for i in range(n_elements): + x[i] = 0.1 + 0.01 * i + y[None] = 0.0 + compute() + y.grad[None] = 1.0 + for i in range(n_elements): + x.grad[i] = 0.0 + return compute, x, y + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=32) +def test_adstack_overflow_raises(): + # Runs a backward pass with a for-loop longer than the adstack can hold, and asserts the overflow surfaces as a + # regular Python exception on the next `qd.sync()` - not a silent wrong gradient and not a process crash. This + # is what users see when their differentiable kernel is too deep for the current `ad_stack_size`, and the error + # message should tell them how to raise the capacity. + # + # Internal detail: both LLVM and SPIR-V defer the error to the next `qd.sync()` (same pattern as CUDA async + # errors) so we do not pay a sync-per-launch. LLVM polls `runtime->adstack_overflow_flag` from + # `LlvmProgramImpl::synchronize()` via `check_adstack_overflow()`; SPIR-V's gfx runtime raises via `QD_ERROR` + # on sync. The test launches the overflowing grad kernel and calls `qd.sync()` inside the same `pytest.raises` + # block so the deferred surfacing point is caught. + compute, _, _ = _overflowing_compute() + # On LLVM the runtime raises QuadrantsAssertionError (subclass of AssertionError) from + # check_adstack_overflow; on SPIR-V the gfx runtime raises RuntimeError via QD_ERROR. We accept either, + # matching only the message prefix. 
+ with pytest.raises((AssertionError, RuntimeError), match=r"[Aa]dstack overflow"): + compute.grad() + qd.sync() + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=32) +def test_adstack_overflow_flag_resets_after_catch(): + # Once `check_adstack_overflow()` raises, the runtime must clear its overflow flag so a subsequent `qd.sync()` + # (with no new overflowing grad launch in between) returns normally. Without the reset the user would see a + # stale overflow exception every time they sync after the first one, which makes diagnosis and recovery + # impossible. + compute, _, _ = _overflowing_compute() + with pytest.raises((AssertionError, RuntimeError), match=r"[Aa]dstack overflow"): + compute.grad() + qd.sync() + # No new grad launch here - the flag must already be back to zero. + qd.sync() + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=1024) +def test_adstack_large_capacity_resolves_overflow(): + # Same kernel shape as `test_adstack_overflow_raises`, but with `ad_stack_size=1024` explicitly passed to + # `qd.init()`. Asserts that raising the capacity (rather than shrinking the loop) is a valid workaround and + # that the backward pass runs to completion with a correct gradient. This is the remediation path the overflow + # error message points users at. + compute, x, _ = _overflowing_compute() + compute.grad() + qd.sync() + + # y += sin(v) iterated with v = x[0] + k for k = 0..63, so dy/dx[0] = sum_k cos(x[0] + k). + expected = sum(math.cos(0.1 + k) for k in range(64)) + assert x.grad[0] == test_utils.approx(expected, rel=1e-3) + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=32) +def test_adstack_overflow_multithreaded(): + # Multi-element field so several threads execute the overflowing grad body in parallel. Asserts the overflow + # still surfaces as a single Python exception rather than deadlocking, crashing, or racing on the flag. 
Every + # thread writes the same flag value (non-zero), so a race on the write is benign; this test pins that the + # read side is also safe (one raise per sync regardless of how many threads flipped the bit). + compute, _, _ = _overflowing_compute(n_elements=16) + with pytest.raises((AssertionError, RuntimeError), match=r"[Aa]dstack overflow"): + compute.grad() + qd.sync() + + +def test_adstack_overflow_during_teardown_does_not_abort(tmp_path): + # This test runs the kernel in a child process (not via `@test_utils.test`, which iterates arches), so it + # cannot rely on the decorator's `require=qd.extension.adstack` skip. Guard manually: skip if the CPU backend + # was not built with the adstack extension, matching what the sibling overflow tests get from the decorator. + if not is_extension_supported(qd.cpu, qd.extension.adstack): + pytest.skip("adstack extension not available on cpu") + + # If a user launches an overflowing grad kernel and never calls `qd.sync()` before the process exits, the + # adstack-overflow flag is still set when Python interpreter teardown invokes `Program::finalize()`. The two + # teardown syncs inside `Program::finalize()` must not re-raise a `QuadrantsAssertionError` into the + # destructor path - doing so would terminate the process with `std::terminate()` instead of returning a clean + # exit code. A subprocess runs the overflowing-grad kernel without calling `qd.sync()` at all and exits; this + # test asserts that the child returns with exit code 0 rather than SIGABRT (-6) or any other non-zero code. + # + # Internal details: `Program::finalize()` invokes `program_impl_->pre_finalize()` before the two teardown + # `synchronize()` calls. `LlvmProgramImpl::pre_finalize()` sets `finalizing_ = true` so + # `LlvmProgramImpl::synchronize()` short-circuits `check_adstack_overflow()`. Note the flag must be set + # *before* those syncs run - setting it only inside `LlvmProgramImpl::finalize()` (which is dispatched after + # them) is too late. 
The subprocess is launched from a temp file because `python -c ""` breaks + # Quadrants' kernel source-inspect (`getsourcelines` cannot find the source of an inlined `-c` string); the + # grad call is deliberately left unsynced so this is the teardown path, not the user-catch path. + child_script = textwrap.dedent( + """ + import quadrants as qd + + qd.init(arch=qd.cpu, ad_stack_experimental_enabled=True, ad_stack_size=32) + + x = qd.field(qd.f32) + y = qd.field(qd.f32) + qd.root.dense(qd.i, 1).place(x, x.grad) + qd.root.place(y, y.grad) + + @qd.kernel + def compute(): + for i in x: + v = x[i] + for _ in range(64): + y[None] += qd.sin(v) + v = v + 1.0 + + x[0] = 0.1 + y[None] = 0.0 + compute() + y.grad[None] = 1.0 + x.grad[0] = 0.0 + compute.grad() + # Intentionally no qd.sync() and no try/except here: the adstack-overflow flag is left set when the + # process exits, so teardown must swallow it via the `finalizing_` guard rather than re-raising. + """ + ) + script_path = tmp_path / "overflow_teardown_child.py" + script_path.write_text(child_script) + # No `timeout=` on subprocess.run: pytest's own per-test timeout (`--timeout=...` in CI and locally) already + # terminates the whole test if the child deadlocks. Adding a second timeout here would only duplicate that + # safety net with a different failure mode (TimeoutExpired vs pytest-timeout's clean teardown). + result = subprocess.run([sys.executable, str(script_path)], capture_output=True, check=False) + if result.returncode != 0: + raise AssertionError( + f"child exited with {result.returncode}\n" + f"stdout:\n{result.stdout.decode()}\n" + f"stderr:\n{result.stderr.decode()}" + ) + + def _run_sum_linear( qd_dtype, use_static_loop, use_varying_coeff, n_iter, rel_tol, approx=test_utils.approx, abs_tol=None ):