From ab15b1b82c1cc2ef2d0029db9faf913ce4ef2145 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:40:10 -0700 Subject: [PATCH 01/39] Add CUDA stream and event API for concurrent kernel execution Introduces qd.create_stream() and qd.create_event() for launching kernels on separate CUDA streams with event-based synchronization. The qd_stream kwarg on kernel calls routes the launch to a specific stream. Non-CUDA backends return no-op handles (0). Routes kernel launcher memory ops through the active stream. --- python/quadrants/lang/__init__.py | 2 + python/quadrants/lang/kernel.py | 16 +- python/quadrants/lang/stream.py | 96 +++++++++ quadrants/program/program.cpp | 93 +++++++++ quadrants/program/program.h | 10 + quadrants/python/export_lang.cpp | 11 +- .../rhi/cuda/cuda_driver_functions.inc.h | 2 + quadrants/runtime/cuda/kernel_launcher.cpp | 20 +- tests/python/test_api.py | 4 + tests/python/test_cache.py | 8 +- tests/python/test_streams.py | 197 ++++++++++++++++++ 11 files changed, 443 insertions(+), 16 deletions(-) create mode 100644 python/quadrants/lang/stream.py create mode 100644 tests/python/test_streams.py diff --git a/python/quadrants/lang/__init__.py b/python/quadrants/lang/__init__.py index dc4fb2cf19..43a4b44b89 100644 --- a/python/quadrants/lang/__init__.py +++ b/python/quadrants/lang/__init__.py @@ -15,6 +15,7 @@ from quadrants.lang.runtime_ops import * from quadrants.lang.snode import * from quadrants.lang.source_builder import * +from quadrants.lang.stream import * from quadrants.lang.struct import * from quadrants.types.enums import DeviceCapability, Format, Layout # noqa: F401 @@ -45,6 +46,7 @@ "shell", "snode", "source_builder", + "stream", "struct", "util", ] diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index af6dbdacb5..4b1578ac4b 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -424,7 +424,9 @@ def materialize(self, key: "CompiledKernelKeyType | None", 
py_args: tuple[Any, . ] runtime._current_global_context = None - def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args) -> Any: + def launch_kernel( + self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args, qd_stream=None + ) -> Any: assert len(args) == len(self.arg_metas), f"{len(self.arg_metas)} arguments needed but {len(args)} provided" callbacks: list[Callable[[], None]] = [] @@ -503,7 +505,14 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled ) self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data - prog.launch_kernel(compiled_kernel_data, launch_ctx) + stream_handle = qd_stream.handle if qd_stream is not None else 0 + if stream_handle: + prog.set_current_cuda_stream(stream_handle) + try: + prog.launch_kernel(compiled_kernel_data, launch_ctx) + finally: + if stream_handle: + prog.set_current_cuda_stream(0) except Exception as e: e = handle_exception_from_cpp(e) if impl.get_runtime().print_full_traceback: @@ -547,6 +556,7 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut # Thus this part needs to be fast. (i.e. 
< 3us on a 4 GHz x64 CPU) @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: + qd_stream = kwargs.pop("qd_stream", None) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() @@ -578,7 +588,7 @@ def __call__(self, *py_args, **kwargs) -> Any: kernel_cpp = self.materialized_kernels[key] compiled_kernel_data = self.compiled_kernel_data_by_key.get(key, None) self.launch_observations.found_kernel_in_materialize_cache = compiled_kernel_data is not None - ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args) + ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args, qd_stream=qd_stream) if compiled_kernel_data is None: assert self._last_compiled_kernel_data is not None self.compiled_kernel_data_by_key[key] = self._last_compiled_kernel_data diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py new file mode 100644 index 0000000000..8530982455 --- /dev/null +++ b/python/quadrants/lang/stream.py @@ -0,0 +1,96 @@ +from quadrants.lang import impl + + +class Stream: + """Wraps a backend-specific GPU stream for concurrent kernel execution. + + On backends without native streams (e.g. CPU), this is a no-op object. + """ + + def __init__(self, handle: int): + self._handle = handle + + @property + def handle(self) -> int: + return self._handle + + def synchronize(self): + """Block until all operations on this stream complete.""" + prog = impl.get_runtime().prog + prog.stream_synchronize(self._handle) + + def destroy(self): + """Explicitly destroy the stream. Safe to call multiple times.""" + if self._handle != 0: + prog = impl.get_runtime().prog + prog.stream_destroy(self._handle) + self._handle = 0 + + def __del__(self): + if self._handle != 0: + try: + self.destroy() + except Exception: + pass + + +class Event: + """Wraps a backend-specific GPU event for stream synchronization. + + On backends without native events (e.g. CPU), this is a no-op object. 
+ """ + + def __init__(self, handle: int): + self._handle = handle + + @property + def handle(self) -> int: + return self._handle + + def record(self, stream: Stream | None = None): + """Record this event on a stream. None means the default stream.""" + prog = impl.get_runtime().prog + stream_handle = stream.handle if stream is not None else 0 + prog.event_record(self._handle, stream_handle) + + def wait(self, qd_stream: Stream | None = None): + """Make a stream wait for this event. None means the default stream.""" + prog = impl.get_runtime().prog + stream_handle = qd_stream.handle if qd_stream is not None else 0 + prog.stream_wait_event(stream_handle, self._handle) + + def synchronize(self): + """Block the host until this event has been reached.""" + prog = impl.get_runtime().prog + prog.event_synchronize(self._handle) + + def destroy(self): + """Explicitly destroy the event. Safe to call multiple times.""" + if self._handle != 0: + prog = impl.get_runtime().prog + prog.event_destroy(self._handle) + self._handle = 0 + + def __del__(self): + if self._handle != 0: + try: + self.destroy() + except Exception: + pass + + +def create_stream() -> Stream: + """Create a new GPU stream for concurrent kernel execution.""" + prog = impl.get_runtime().prog + handle = prog.stream_create() + return Stream(handle) + + +def create_event() -> Event: + """Create a new GPU event for stream synchronization.""" + prog = impl.get_runtime().prog + handle = prog.event_create() + return Event(handle) + + +__all__ = ["Stream", "Event", "create_stream", "create_event"] diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 7f5dfef2d8..9b2ff0886b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -20,6 +20,11 @@ #include "quadrants/codegen/llvm/struct_llvm.h" #endif +#ifdef QD_WITH_CUDA +#include "quadrants/rhi/cuda/cuda_driver.h" +#include "quadrants/rhi/cuda/cuda_context.h" +#endif + #ifdef QD_WITH_VULKAN #include 
"quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -481,4 +486,92 @@ void Program::enqueue_compute_op_lambda( program_impl_->enqueue_compute_op_lambda(op, image_refs); } +uint64 Program::stream_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + void *stream = nullptr; + CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } +#endif + return 0; +} + +void Program::stream_destroy(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDADriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::stream_synchronize(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDADriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::set_current_cuda_stream(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif +} + +uint64 Program::event_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + void *event = nullptr; + CUDADriver::get_instance().event_create(&event, + 0x02 /*CU_EVENT_DISABLE_TIMING*/); + return reinterpret_cast(event); + } +#endif + return 0; +} + +void Program::event_destroy(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif +} + +void Program::event_record(uint64 event_handle, uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_record( + reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif +} + +void 
Program::event_synchronize(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif +} + +void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif +} + } // namespace quadrants::lang diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 1fa2c2ac57..9568c371c8 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -300,6 +300,16 @@ class QD_DLL_EXPORT Program { return ndarrays_.size(); } + uint64 stream_create(); + void stream_destroy(uint64 stream_handle); + void stream_synchronize(uint64 stream_handle); + void set_current_cuda_stream(uint64 stream_handle); + uint64 event_create(); + void event_destroy(uint64 event_handle); + void event_record(uint64 event_handle, uint64 stream_handle); + void event_synchronize(uint64 event_handle); + void stream_wait_event(uint64 stream_handle, uint64 event_handle); + // TODO(zhanlue): Move these members and corresponding interfaces to // ProgramImpl Ideally, Program should serve as a pure interface class and all // the implementations should fall inside ProgramImpl diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index b3d23c0037..2f5da8b1b4 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -495,7 +495,16 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) - .def("get_device_caps", &Program::get_device_caps); + .def("get_device_caps", &Program::get_device_caps) + .def("stream_create", &Program::stream_create) + 
.def("stream_destroy", &Program::stream_destroy) + .def("stream_synchronize", &Program::stream_synchronize) + .def("set_current_cuda_stream", &Program::set_current_cuda_stream) + .def("event_create", &Program::event_create) + .def("event_destroy", &Program::event_destroy) + .def("event_record", &Program::event_record) + .def("event_synchronize", &Program::event_synchronize) + .def("stream_wait_event", &Program::stream_wait_event); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 25b3c7958e..a9690ca10b 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -20,6 +20,7 @@ PER_CUDA_FUNCTION(context_set_limit, cuCtxSetLimit, int, std::size_t); // Stream management PER_CUDA_FUNCTION(stream_create, cuStreamCreate, void **, uint32); +PER_CUDA_FUNCTION(stream_destroy, cuStreamDestroy_v2, void *); // Memory management PER_CUDA_FUNCTION(memcpy_host_to_device, cuMemcpyHtoD_v2, void *, void *, std::size_t); @@ -52,6 +53,7 @@ PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_a // Stream management PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); +PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); // Event management PER_CUDA_FUNCTION(event_create, cuEventCreate, void **, uint32) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 5eae5e747d..13845d5a9b 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" +#include "quadrants/rhi/cuda/cuda_driver.h" namespace quadrants::lang { namespace cuda { @@ -43,10 +44,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, // kernels. 
std::unordered_map device_ptrs; + auto *active_stream = CUDAContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; CUDADriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64)), nullptr); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); ctx.get_context().runtime = executor->get_llvm_runtime(); for (int i = 0; i < (int)parameters.size(); i++) { @@ -120,7 +123,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDADriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { @@ -129,10 +132,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().malloc_async((void **)&device_arg_buffer, - ctx.arg_buffer_size, nullptr); + ctx.arg_buffer_size, active_stream); CUDADriver::get_instance().memcpy_host_to_device_async( device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, - nullptr); + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } @@ -144,17 +147,18 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, {}); } if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().mem_free_async(device_arg_buffer, nullptr); + CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { CUDADriver::get_instance().memcpy_device_to_host_async( host_result_buffer, device_result_buffer, ctx.result_buffer_size, - nullptr); + active_stream); } - CUDADriver::get_instance().mem_free_async(device_result_buffer, nullptr); + CUDADriver::get_instance().mem_free_async(device_result_buffer, + active_stream); // copy data back to host if (transfers.size() > 0) { - 
CUDADriver::get_instance().stream_synchronize(nullptr); + CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; CUDADriver::get_instance().memcpy_device_to_host( diff --git a/tests/python/test_api.py b/tests/python/test_api.py index cf12abc393..002014c960 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -59,6 +59,7 @@ def _get_expected_matrix_apis(): "DEBUG", "DeviceCapability", "ERROR", + "Event", "Field", "FieldsBuilder", "Format", @@ -73,6 +74,7 @@ def _get_expected_matrix_apis(): "SNode", "ScalarField", "ScalarNdarray", + "Stream", "Struct", "StructField", "TRACE", @@ -117,6 +119,8 @@ def _get_expected_matrix_apis(): "clock_freq_hz", "cos", "cpu", + "create_event", + "create_stream", "cuda", "data_oriented", "dataclass", diff --git a/tests/python/test_cache.py b/tests/python/test_cache.py index c3821e44c5..e31daf61e7 100644 --- a/tests/python/test_cache.py +++ b/tests/python/test_cache.py @@ -216,11 +216,11 @@ def test_fastcache(tmp_path: pathlib.Path, monkeypatch): qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True) is_valid = False - def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args): + def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=None): nonlocal is_valid is_valid = True assert compiled_kernel_data is None - return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args) + return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=qd_stream) monkeypatch.setattr("quadrants.lang.kernel_impl.Kernel.launch_kernel", launch_kernel) @@ -242,11 +242,11 @@ def fun(value: qd.types.ndarray(), offset: qd.template()): qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True) is_valid = False - def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args): + def launch_kernel(self, key, t_kernel, 
compiled_kernel_data, *args, qd_stream=None): nonlocal is_valid is_valid = True assert compiled_kernel_data is not None - return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args) + return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=qd_stream) monkeypatch.setattr("quadrants.lang.kernel_impl.Kernel.launch_kernel", launch_kernel) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py new file mode 100644 index 0000000000..fabc217e96 --- /dev/null +++ b/tests/python/test_streams.py @@ -0,0 +1,197 @@ +"""Tests for GPU stream and event support.""" + +import numpy as np + +import quadrants as qd +from quadrants.lang.stream import Event, Stream + +from tests import test_utils + + +@test_utils.test(arch=[qd.cuda]) +def test_create_and_destroy_stream(): + s = qd.create_stream() + assert isinstance(s, Stream) + assert s.handle != 0 + s.destroy() + assert s.handle == 0 + + +@test_utils.test(arch=[qd.cuda]) +def test_create_and_destroy_event(): + e = qd.create_event() + assert isinstance(e, Event) + assert e.handle != 0 + e.destroy() + assert e.handle == 0 + + +@test_utils.test() +def test_kernel_on_stream(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 42.0 + + s = qd.create_stream() + fill(qd_stream=s) + s.synchronize() + assert np.allclose(x.to_numpy(), 42.0) + s.destroy() + + +@test_utils.test() +def test_two_streams(): + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_a(): + for i in range(N): + a[i] = 1.0 + + @qd.kernel + def fill_b(): + for i in range(N): + b[i] = 2.0 + + s1 = qd.create_stream() + s2 = qd.create_stream() + fill_a(qd_stream=s1) + fill_b(qd_stream=s2) + s1.synchronize() + s2.synchronize() + assert np.allclose(a.to_numpy(), 1.0) + assert np.allclose(b.to_numpy(), 2.0) + s1.destroy() + s2.destroy() + + +@test_utils.test() +def test_event_synchronization(): + N = 1024 + x = 
qd.field(qd.f32, shape=(N,)) + y = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_x(): + for i in range(N): + x[i] = 10.0 + + @qd.kernel + def copy_x_to_y(): + for i in range(N): + y[i] = x[i] + + s1 = qd.create_stream() + fill_x(qd_stream=s1) + + e = qd.create_event() + e.record(s1) + + # Default stream waits for s1 to finish fill_x + e.wait() + copy_x_to_y() + qd.sync() + + assert np.allclose(y.to_numpy(), 10.0) + + e.destroy() + s1.destroy() + + +@test_utils.test() +def test_event_wait_on_stream(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + y = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_x(): + for i in range(N): + x[i] = 5.0 + + @qd.kernel + def copy_x_to_y(): + for i in range(N): + y[i] = x[i] + + s1 = qd.create_stream() + s2 = qd.create_stream() + + fill_x(qd_stream=s1) + + e = qd.create_event() + e.record(s1) + + # s2 waits for s1's event before running + e.wait(qd_stream=s2) + copy_x_to_y(qd_stream=s2) + s2.synchronize() + + assert np.allclose(y.to_numpy(), 5.0) + + e.destroy() + s1.destroy() + s2.destroy() + + +@test_utils.test() +def test_default_stream_kernel(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 7.0 + + fill() + qd.sync() + assert np.allclose(x.to_numpy(), 7.0) + + +@test_utils.test(arch=[qd.cpu]) +def test_stream_noop_on_cpu(): + """Streams should be no-ops on CPU without errors.""" + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 3.0 + + s = qd.create_stream() + assert s.handle == 0 + fill(qd_stream=s) + qd.sync() + assert np.allclose(x.to_numpy(), 3.0) + + e = qd.create_event() + assert e.handle == 0 + e.record(s) + e.wait() + s.destroy() + e.destroy() + + +@test_utils.test() +def test_stream_with_ndarray(): + N = 1024 + + @qd.kernel + def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(N): + arr[i] = 99.0 + + arr = qd.ndarray(qd.f32, shape=(N,)) + s = qd.create_stream() + fill(arr, 
qd_stream=s) + s.synchronize() + assert np.allclose(arr.to_numpy(), 99.0) + s.destroy() From b856b33247dfbb55ca5f781e788fc50d5e32c9e9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:25:18 -0700 Subject: [PATCH 02/39] Address review feedback for CUDA streams PR - Make CUDAContext::stream_ thread_local for thread-safety - Convert sync memcpy_host_to_device to async on active_stream - Use weakref in Stream/Event __del__ to safely handle interpreter shutdown - Add __enter__/__exit__ context manager support for Stream and Event - Use consistent qd_stream parameter naming in Event.record and Event.wait - Add handle==0 guard to stream_synchronize --- python/quadrants/lang/stream.py | 60 ++++++++++++++++------ quadrants/program/program.cpp | 2 +- quadrants/rhi/cuda/cuda_context.cpp | 6 +-- quadrants/rhi/cuda/cuda_context.h | 2 +- quadrants/runtime/cuda/kernel_launcher.cpp | 10 ++-- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8530982455..8f6cfab3d6 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -1,14 +1,22 @@ +import weakref + from quadrants.lang import impl +def _get_prog_weakref(): + return weakref.ref(impl.get_runtime().prog) + + class Stream: """Wraps a backend-specific GPU stream for concurrent kernel execution. On backends without native streams (e.g. CPU), this is a no-op object. + Call destroy() explicitly or use as a context manager to ensure cleanup. 
""" - def __init__(self, handle: int): + def __init__(self, handle: int, prog_ref: weakref.ref | None = None): self._handle = handle + self._prog_ref = prog_ref @property def handle(self) -> int: @@ -27,30 +35,41 @@ def destroy(self): self._handle = 0 def __del__(self): - if self._handle != 0: - try: - self.destroy() - except Exception: - pass + if self._handle != 0 and self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + try: + prog.stream_destroy(self._handle) + self._handle = 0 + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() class Event: """Wraps a backend-specific GPU event for stream synchronization. On backends without native events (e.g. CPU), this is a no-op object. + Call destroy() explicitly or use as a context manager to ensure cleanup. """ - def __init__(self, handle: int): + def __init__(self, handle: int, prog_ref: weakref.ref | None = None): self._handle = handle + self._prog_ref = prog_ref @property def handle(self) -> int: return self._handle - def record(self, stream: Stream | None = None): + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. 
None means the default stream.""" prog = impl.get_runtime().prog - stream_handle = stream.handle if stream is not None else 0 + stream_handle = qd_stream.handle if qd_stream is not None else 0 prog.event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): @@ -72,25 +91,34 @@ def destroy(self): self._handle = 0 def __del__(self): - if self._handle != 0: - try: - self.destroy() - except Exception: - pass + if self._handle != 0 and self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + try: + prog.event_destroy(self._handle) + self._handle = 0 + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() def create_stream() -> Stream: """Create a new GPU stream for concurrent kernel execution.""" prog = impl.get_runtime().prog handle = prog.stream_create() - return Stream(handle) + return Stream(handle, _get_prog_weakref()) def create_event() -> Event: """Create a new GPU event for stream synchronization.""" prog = impl.get_runtime().prog handle = prog.event_create() - return Event(handle) + return Event(handle, _get_prog_weakref()) __all__ = ["Stream", "Event", "create_stream", "create_event"] diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..be152d02da 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -508,7 +508,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (compile_config().arch == Arch::cuda && stream_handle != 0) { CUDADriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 89c16135a2..23399649a9 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -11,10 +11,10 @@ namespace quadrants::lang 
{ +thread_local void *CUDAContext::stream_ = nullptr; + CUDAContext::CUDAContext() - : profiler_(nullptr), - driver_(CUDADriver::get_instance_without_context()), - stream_(nullptr) { + : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { // CUDA initialization dev_count_ = 0; driver_.init(0); diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index c57baa3d92..ba891644a7 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -30,7 +30,7 @@ class CUDAContext { int max_shared_memory_bytes_; bool debug_; bool supports_mem_pool_; - void *stream_; + static thread_local void *stream_; public: CUDAContext(); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 13845d5a9b..9bbf75044e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -85,8 +85,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + CUDADriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device( @@ -95,8 +96,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz); + CUDADriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, + active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } From 9be110daf54838a2da4a430e254e25afdfb198e9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: 
Sun, 19 Apr 2026 17:24:53 -0700 Subject: [PATCH 03/39] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 28 ++++++++-------------- quadrants/rhi/cuda/cuda_context.cpp | 3 +-- quadrants/runtime/cuda/kernel_launcher.cpp | 3 +-- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index a591fb8dba..ec5a9fa57d 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -474,8 +474,7 @@ uint64 Program::stream_create() { void Program::stream_destroy(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDADriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -483,8 +482,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDADriver::get_instance().stream_synchronize( - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } @@ -492,8 +490,7 @@ void Program::stream_synchronize(uint64 stream_handle) { void Program::set_current_cuda_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -502,8 +499,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { void *event = nullptr; - CUDADriver::get_instance().event_create(&event, - 0x02 /*CU_EVENT_DISABLE_TIMING*/); + CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } #endif @@ -513,8 +509,7 @@ uint64 
Program::event_create() { void Program::event_destroy(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); + CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -522,9 +517,8 @@ void Program::event_destroy(uint64 event_handle) { void Program::event_record(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -532,8 +526,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { void Program::event_synchronize(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -541,9 +534,8 @@ void Program::event_synchronize(uint64 event_handle) { void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index a605d06c64..60553da9c7 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -13,8 +13,7 @@ namespace quadrants::lang { thread_local void *CUDAContext::stream_ = 
nullptr; -CUDAContext::CUDAContext() - : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { +CUDAContext::CUDAContext() : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { // CUDA initialization dev_count_ = 0; driver_.init(0); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 34905218f9..0c5d7e9458 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -173,8 +173,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - CUDADriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + CUDADriver::get_instance().mem_free_async(device_result_buffer, active_stream); // copy data back to host if (transfers.size() > 0) { CUDADriver::get_instance().stream_synchronize(active_stream); From d3cae3cbaa1a3ffd832a30320fa59c5af753e595 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 02:18:49 -0700 Subject: [PATCH 04/39] [Test] Exclude flaky test_perf_dispatch_python from Vulkan The pure-Python perf dispatch test is timing-sensitive and unreliable on the Vulkan software renderer in CI. The kernel variant of the same test still covers perf dispatch on Vulkan. 
Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..b533105c42 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From cd5b486beab0fc878652d4d2d043f44f1bd58e12 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:35:22 -0700 Subject: [PATCH 05/39] [Doc] Add user guide for streams API --- docs/source/user_guide/index.md | 1 + docs/source/user_guide/streams.md | 145 ++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 docs/source/user_guide/streams.md diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md index 05a5dfc434..7775e56f0e 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -54,6 +54,7 @@ tile16 :titlesonly: graph +streams perf_dispatch ``` diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md new file mode 100644 index 0000000000..0a610fd217 --- /dev/null +++ b/docs/source/user_guide/streams.md @@ -0,0 +1,145 @@ +# Streams + +Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default +stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently +and control synchronization with events. 
+ +## Supported platforms + +| Backend | Streams | Events | Notes | +|---------|---------|--------|-------| +| CUDA | Yes | Yes | Full concurrent execution | +| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | + +On backends without native stream support, `create_stream()` and `create_event()` return objects with handle +`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. + +## Creating and using streams + +```python +import quadrants as qd + +qd.init(arch=qd.cuda) + +N = 1024 +a = qd.field(qd.f32, shape=(N,)) +b = qd.field(qd.f32, shape=(N,)) + +@qd.kernel +def fill_a(): + for i in range(N): + a[i] = 1.0 + +@qd.kernel +def fill_b(): + for i in range(N): + b[i] = 2.0 + +s1 = qd.create_stream() +s2 = qd.create_stream() + +fill_a(qd_stream=s1) +fill_b(qd_stream=s2) + +s1.synchronize() +s2.synchronize() + +s1.destroy() +s2.destroy() +``` + +Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute +concurrently. Call `synchronize()` to block until all work on a stream completes. + +## Events + +Events let you express dependencies between streams without full synchronization. 
+ +```python +s1 = qd.create_stream() +s2 = qd.create_stream() + +@qd.kernel +def produce(): + for i in range(N): + a[i] = 10.0 + +@qd.kernel +def consume(): + for i in range(N): + b[i] = a[i] + +produce(qd_stream=s1) + +e = qd.create_event() +e.record(s1) # record when s1 finishes produce() +e.wait(qd_stream=s2) # s2 waits for that event before proceeding + +consume(qd_stream=s2) # safe to read a[] — produce() is guaranteed complete +s2.synchronize() + +e.destroy() +s1.destroy() +s2.destroy() +``` + +`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait +until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. + +## Context managers + +Streams and events support `with` blocks for automatic cleanup: + +```python +with qd.create_stream() as s: + fill_a(qd_stream=s) + s.synchronize() +# s.destroy() called automatically +``` + +## PyTorch interop (CUDA) + +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to +avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different +streams with no ordering guarantees, leading to intermittent data corruption. + +### Running Quadrants kernels on PyTorch's stream + +```python +import torch +from quadrants.lang.stream import Stream + +torch_stream_ptr = torch.cuda.current_stream().cuda_stream +stream = Stream(torch_stream_ptr) + +physics_kernel(qd_stream=stream) +observations = compute_obs_tensor() # PyTorch op on the same stream +apply_actions_kernel(qd_stream=stream) +``` + +Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this +wrapper — PyTorch owns the underlying stream. 
+ +### Running PyTorch operations on a Quadrants stream + +```python +qd_stream = qd.create_stream() +torch_stream = torch.cuda.ExternalStream(qd_stream.handle) + +with torch.cuda.stream(torch_stream): + physics_kernel(qd_stream=qd_stream) + observations = compute_obs_tensor() + apply_actions_kernel(qd_stream=qd_stream) + +qd_stream.destroy() +``` + +`Stream.handle` is the raw `CUstream` pointer, which `torch.cuda.ExternalStream` accepts directly. + +## Limitations + +- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. +- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one + stream's output is another stream's input. From f2a2596c577235d796fa810a969d902e5dfe7016 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 11:11:04 -0700 Subject: [PATCH 06/39] Reflow stream.py docstrings to 120c line width --- python/quadrants/lang/stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8f6cfab3d6..5e54b227cd 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -10,8 +10,8 @@ def _get_prog_weakref(): class Stream: """Wraps a backend-specific GPU stream for concurrent kernel execution. - On backends without native streams (e.g. CPU), this is a no-op object. - Call destroy() explicitly or use as a context manager to ensure cleanup. + On backends without native streams (e.g. CPU), this is a no-op object. Call destroy() explicitly or use as + a context manager to ensure cleanup. """ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): @@ -54,8 +54,8 @@ def __exit__(self, *args): class Event: """Wraps a backend-specific GPU event for stream synchronization. - On backends without native events (e.g. CPU), this is a no-op object. - Call destroy() explicitly or use as a context manager to ensure cleanup. 
+ On backends without native events (e.g. CPU), this is a no-op object. Call destroy() explicitly or use as + a context manager to ensure cleanup. """ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): From de99f3efb295525d5ef1c80b30dc0b0007c97290 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 11:15:31 -0700 Subject: [PATCH 07/39] Unwrap prose lines in streams.md to match repo doc style --- docs/source/user_guide/streams.md | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..0fb2627c0c 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -1,8 +1,6 @@ # Streams -Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default -stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently -and control synchronization with events. +Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently and control synchronization with events. ## Supported platforms @@ -13,8 +11,7 @@ and control synchronization with events. | Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -On backends without native stream support, `create_stream()` and `create_event()` return objects with handle -`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. +On backends without native stream support, `create_stream()` and `create_event()` return objects with handle `0`. 
All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. ## Creating and using streams @@ -50,8 +47,7 @@ s1.destroy() s2.destroy() ``` -Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute -concurrently. Call `synchronize()` to block until all work on a stream completes. +Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. ## Events @@ -85,8 +81,7 @@ s1.destroy() s2.destroy() ``` -`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait -until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. +`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. ## Context managers @@ -101,9 +96,7 @@ with qd.create_stream() as s: ## PyTorch interop (CUDA) -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to -avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different -streams with no ordering guarantees, leading to intermittent data corruption. +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. 
### Running Quadrants kernels on PyTorch's stream @@ -119,8 +112,7 @@ observations = compute_obs_tensor() # PyTorch op on the same stream apply_actions_kernel(qd_stream=stream) ``` -Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this -wrapper — PyTorch owns the underlying stream. +Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. ### Running PyTorch operations on a Quadrants stream @@ -141,5 +133,4 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one - stream's output is another stream's input. +- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. From 401d6f81f0641c73118e1356feb9b87c3480e4f1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:49:13 -0700 Subject: [PATCH 08/39] Use CU_STREAM_NON_BLOCKING for user-created streams Streams created with CU_STREAM_DEFAULT (flag 0) implicitly synchronize with the legacy NULL stream, defeating concurrent execution when any code path (including the kernel launcher's sizer-context block) posts work on the NULL stream. Switch to CU_STREAM_NON_BLOCKING (0x1) to match PyTorch/JAX/CuPy conventions and deliver the concurrency the stream API promises. 
--- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 83adc99627..a38ddd0dbb 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -498,7 +498,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { void *stream = nullptr; - CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); + CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); return reinterpret_cast<uint64>(stream); } #endif From a3c98f8da17148524f73d6c5faf348337cd7e8a9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:49:40 -0700 Subject: [PATCH 09/39] Use async DtoH memcpy on active_stream for external array readback The post-kernel readback of host-backed external arrays used synchronous cuMemcpyDtoH which implicitly serializes through the NULL stream, defeating stream isolation. Switch to memcpy_device_to_host_async on active_stream with a scoped stream_synchronize, consistent with the HtoD direction already converted in this branch.
--- quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index de6bab83e6..8a33bf0b61 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -253,8 +253,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - CUDADriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[idx.arg_id]); + CUDADriver::get_instance().memcpy_device_to_host_async( + itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + CUDADriver::get_instance().stream_synchronize(active_stream); + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } From ca14f6753f35b3feedb4fd2f84e3ca0d3475a1e3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:50:01 -0700 Subject: [PATCH 10/39] Guard destroy()/__exit__ against destroying externally-owned handles Stream.__del__ already checks self._prog_ref is not None to avoid destroying handles wrapping external streams (e.g. PyTorch), but destroy() and __exit__ did not. A user doing `with Stream(torch_stream_ptr): ...` would destroy the PyTorch stream on block exit. Add the same ownership guard to destroy() for both Stream and Event. 
--- python/quadrants/lang/stream.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 5e54b227cd..063a2aeafc 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -28,8 +28,11 @@ def synchronize(self): prog.stream_synchronize(self._handle) def destroy(self): - """Explicitly destroy the stream. Safe to call multiple times.""" - if self._handle != 0: + """Explicitly destroy the stream. Safe to call multiple times. + + No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). + """ + if self._handle != 0 and self._prog_ref is not None: prog = impl.get_runtime().prog prog.stream_destroy(self._handle) self._handle = 0 @@ -84,8 +87,11 @@ def synchronize(self): prog.event_synchronize(self._handle) def destroy(self): - """Explicitly destroy the event. Safe to call multiple times.""" - if self._handle != 0: + """Explicitly destroy the event. Safe to call multiple times. + + No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
+ """ + if self._handle != 0 and self._prog_ref is not None: prog = impl.get_runtime().prog prog.event_destroy(self._handle) self._handle = 0 From b46de06b5c0cc9892cadac4b22812f72d80522d2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:58:15 -0700 Subject: [PATCH 11/39] Fix clang-format indentation for memcpy_device_to_host_async --- quadrants/runtime/cuda/kernel_launcher.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 8a33bf0b61..f3f48ab21e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -253,9 +253,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - CUDADriver::get_instance().memcpy_device_to_host_async( - itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[idx.arg_id], active_stream); + CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); } CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From b9eef6e844a6940848a0f4a52c9f5820ef69e388 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:07:11 -0700 Subject: [PATCH 12/39] Use async DtoH on active_stream for do-while loop counter readback The do-while loop counter readback in launch_offloaded_tasks_with_do_while used synchronous cuMemcpyDtoH which serializes through the NULL stream, defeating stream isolation on every loop iteration. Switch to async memcpy on the active stream followed by stream_synchronize, matching the pattern used elsewhere in the launcher. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index f3f48ab21e..a1ccc470ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -71,8 +71,9 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & launch_offloaded_tasks(ctx, cuda_module, offloaded_tasks, device_context_ptr); counter_val = 0; auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&counter_val, ctx.graph_do_while_flag_dev_ptr, + sizeof(int32_t), stream); CUDADriver::get_instance().stream_synchronize(stream); - CUDADriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From f0dd7d6acb648aef15f8bb726ac86a0d0bca9d05 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:07:30 -0700 Subject: [PATCH 13/39] Use active_stream for sizer device context staging The needs_sizer_device_ctx block (malloc_async, memcpy_host_to_device_async, mem_free_async) was using nullptr (NULL stream) while the consuming sizer kernel runs on active_stream. With non-blocking streams (e.g. wrapped PyTorch streams), there is no implicit ordering between them, creating a race where the sizer kernel could read stale or freed memory. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a1ccc470ab..ca27b78dd1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -227,9 +227,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx needs_sizer_device_ctx = needs_sizer_device_ctx && !CUDAContext::get_instance().supports_pageable_memory_access(); void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { - CUDADriver::get_instance().malloc_async(&device_context_ptr, sizeof(RuntimeContext), nullptr); + CUDADriver::get_instance().malloc_async(&device_context_ptr, sizeof(RuntimeContext), active_stream); CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), - sizeof(RuntimeContext), nullptr); + sizeof(RuntimeContext), active_stream); } if (ctx.graph_do_while_arg_id >= 0) { @@ -239,7 +239,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx launch_offloaded_tasks(ctx, cuda_module, offloaded_tasks, device_context_ptr); } if (needs_sizer_device_ctx) { - CUDADriver::get_instance().mem_free_async(device_context_ptr, nullptr); + CUDADriver::get_instance().mem_free_async(device_context_ptr, active_stream); } if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); From 8b3d4ed5f513603e1c3066090576cc0d90742329 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:08:04 -0700 Subject: [PATCH 14/39] Add make_current() to stream/event Program methods All other CUDA entry points (kernel_launcher, jit_cuda, graph_manager) call CUDAContext::get_instance().make_current() to bind the primary context on the calling thread. 
The new stream/event methods skipped this, which would cause CUDA_ERROR_INVALID_CONTEXT if called from a thread other than the qd.init thread. --- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index a38ddd0dbb..5abcd255b3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -497,6 +497,7 @@ void Program::enqueue_compute_op_lambda(std::function(stream); @@ -508,6 +509,7 @@ uint64 Program::stream_create() { void Program::stream_destroy(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -516,6 +518,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -524,6 +527,7 @@ void Program::stream_synchronize(uint64 stream_handle) { void Program::set_current_cuda_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -532,6 +536,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { uint64 Program::event_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); void *event = nullptr; CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); @@ -543,6 +548,7 @@ uint64 Program::event_create() { void Program::event_destroy(uint64 event_handle) { #ifdef QD_WITH_CUDA if 
(compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_destroy(reinterpret_cast<void *>(event_handle)); } #endif @@ -551,6 +557,7 @@ void Program::event_destroy(uint64 event_handle) { void Program::event_record(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_record(reinterpret_cast<void *>(event_handle), reinterpret_cast<void *>(stream_handle)); } @@ -560,6 +567,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { void Program::event_synchronize(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_synchronize(reinterpret_cast<void *>(event_handle)); } #endif @@ -568,6 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_wait_event(reinterpret_cast<void *>(stream_handle), reinterpret_cast<void *>(event_handle), 0 /*flags*/); } From aa4a70f91983d26fed7c73a380d5a13646997ed2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:52:32 -0700 Subject: [PATCH 15/39] Use async DtoH on active_stream for resolve_num_threads readback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolve_num_threads reads dynamic range_for begin/end from device temporaries via synchronous cuMemcpyDtoH (NULL stream).
With CU_STREAM_NON_BLOCKING user streams, the prep task's store on active_stream has no ordering with the NULL stream, so the readback can return stale values — leading to wrong adstack sizing and either CUDA_ERROR_ILLEGAL_ADDRESS or silent gradient corruption. Switch to async memcpy on active_stream + stream_synchronize, matching the pattern used at all other DtoH sites in the launcher. --- quadrants/runtime/cuda/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ca27b78dd1..005ad480e9 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -23,15 +23,17 @@ std::size_t resolve_num_threads(const AdStackSizingInfo &info, LlvmRuntimeExecut std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = CUDAContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - CUDADriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + CUDADriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - CUDADriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + CUDADriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + CUDADriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()` (`block_idx * block_dim + thread_idx`), so only 
`static_num_threads = grid_dim * From 5901a7fc83e7b17a3f6d580449b14db952ccf5d2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:13:32 -0700 Subject: [PATCH 16/39] Sync active_stream at end of launch_llvm_kernel unconditionally The result-buffer DtoH and mem_free_async are queued on active_stream, but stream_synchronize only ran inside the transfers.size() > 0 branch. For the ndarray/CUDA-tensor path (transfers empty), the launcher returned with the DtoH still in flight on a CU_STREAM_NON_BLOCKING stream. The post-launcher cuStreamSynchronize(NULL) in runtime_ops.sync does not drain non-blocking streams, so fetch_ret_impl could read stale bytes. Move the sync to the end of the function unconditionally. --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 005ad480e9..d6931da87a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -259,11 +259,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } - CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } + CUDADriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From 8550aa012d4759d7e7de0737aabbce86b4c33bf7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:45:20 -0700 Subject: [PATCH 17/39] Fix end-of-launcher sync: conditional + dealloc race Two fixes in the post-launch cleanup: 1. 
The unconditional stream_synchronize(active_stream) blocked the host on every kernel launch, defeating stream concurrency for the common case (no return value, no host-backed arrays). Make it conditional: sync only when result_buffer_size > 0 (the stale-bytes path), or when transfers are present (already had its own sync). 2. The transfers branch queued async DtoH on active_stream then immediately deallocated device memory via mem_free_async(NULL stream). With CU_STREAM_NON_BLOCKING streams, the dealloc could race with the in-flight DtoH. Add stream_synchronize(active_stream) between the DtoH loop and the dealloc loop. --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index d6931da87a..b0d2da095c 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -259,11 +259,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + CUDADriver::get_instance().stream_synchronize(active_stream); } - CUDADriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From 6374cf3bfe91f700863af6cd510fe7ed00446f34 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:45:59 -0700 Subject: [PATCH 18/39] Reject qd_stream inside autograd Tape context The Tape replay path (Tape.grad) calls func.grad(*args) with no kwargs, so qd_stream is silently dropped and 
the backward kernel runs on the default stream with no ordering guarantee relative to the forward on the user's stream. Raise RuntimeError when both are used, matching the existing graph=True incompatibility pattern. Document the limitation. --- docs/source/user_guide/streams.md | 1 + python/quadrants/lang/kernel.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0fb2627c0c..85d4e8d12c 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,4 +133,5 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. +- **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 7c68373f34..8a1004c6a8 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -649,6 +649,9 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) + if qd_stream is not None and self.runtime.target_tape: + raise RuntimeError("qd_stream is not compatible with autograd Tape. 
Launch the kernel outside the Tape " + "context, or omit qd_stream.") if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() From ca8ace3b6be6ed794f0f7619f92d3328a61d1e41 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:57:14 -0700 Subject: [PATCH 19/39] Fix linter formatting; guard graph+stream; sync has_print on stream Three changes: 1. Fix black formatting of the Tape+stream RuntimeError. 2. Raise RuntimeError when qd_stream is passed to a graph=True kernel, enforcing the documented limitation in streams.md rather than silently bypassing the end-of-launcher sync. 3. When a kernel has print statements but no return value, and runs on a qd_stream, sync the user stream before runtime_ops.sync(). The NULL-stream sync in runtime_ops does not drain CU_STREAM_NON_BLOCKING user streams, so CUDA printf buffers would otherwise not be flushed. --- python/quadrants/lang/kernel.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..e0cdf945b5 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -561,6 +561,11 @@ def launch_kernel( self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED + if launch_ctx.use_graph and qd_stream is not None: + raise RuntimeError( + "qd_stream is not compatible with graph=True kernels. " + "See docs/source/user_guide/streams.md for details." 
+ ) if self.graph_do_while_arg is not None and hasattr(self, "_graph_do_while_cpp_arg_id"): launch_ctx.graph_do_while_arg_id = self._graph_do_while_cpp_arg_id stream_handle = qd_stream.handle if qd_stream is not None else 0 @@ -582,6 +587,8 @@ def launch_kernel( return_type = self.return_type if return_type or self.has_print: + if qd_stream is not None and self.has_print and not return_type: + qd_stream.synchronize() runtime_ops.sync() if not return_type: @@ -650,8 +657,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." + ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() From b1c6eea4249b29c530debbba122b502da4619592 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:32:06 -0700 Subject: [PATCH 20/39] Sync active_stream before adstack sizer stride readback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit publish_adstack_metadata launches the sizer kernel on active_stream but reads the computed stride via synchronous copy_d2h (NULL stream). With CU_STREAM_NON_BLOCKING user streams, the NULL stream does not wait for the sizer kernel to complete, so the readback can return stale stride values — sizing the adstack heap incorrectly. Add stream_synchronize(active_stream) before the D2H. 
--- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..9e2d3c9041 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -943,7 +943,12 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, runtime_context_ptr_for_sizer, bytecode_dev_ptr); - // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // The sizer kernel runs on active_stream; drain it before reading the stride on the host. +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + CUDADriver::get_instance().stream_synchronize(CUDAContext::get_instance().get_stream()); + } +#endif uint64_t stride_u64 = 0; copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); stride = static_cast(stride_u64); From 3c6b24eb4706574a9bb755c335b0e1cda318b35b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:50:48 -0700 Subject: [PATCH 21/39] Add tests for stream/event context managers, event.synchronize, error paths Cover the gaps flagged by the test-coverage CI check: - Stream and Event used as context managers (__enter__/__exit__) - Event.synchronize() method - RuntimeError when qd_stream is combined with autograd Tape - RuntimeError when qd_stream is combined with graph=True Co-authored-by: Cursor --- tests/python/test_streams.py | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..8a00024220 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -1,6 +1,7 @@ """Tests for GPU stream and event support.""" import numpy as np +import pytest import 
quadrants as qd from quadrants.lang.stream import Event, Stream @@ -195,3 +196,80 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def test_stream_context_manager(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 11.0 + + with qd.create_stream() as s: + fill(qd_stream=s) + s.synchronize() + assert s.handle == 0 + assert np.allclose(x.to_numpy(), 11.0) + + +@test_utils.test() +def test_event_context_manager(): + with qd.create_event() as e: + assert isinstance(e, Event) + assert e.handle == 0 + + +@test_utils.test() +def test_event_synchronize(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 13.0 + + s = qd.create_stream() + fill(qd_stream=s) + e = qd.create_event() + e.record(s) + e.synchronize() + assert np.allclose(x.to_numpy(), 13.0) + e.destroy() + s.destroy() + + +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_tape_raises(): + x = qd.field(qd.f32, shape=(), needs_grad=True) + loss = qd.field(qd.f32, shape=(), needs_grad=True) + + @qd.kernel + def compute(): + loss[None] = x[None] ** 2 + + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with autograd Tape"): + with qd.ad.Tape(loss): + compute(qd_stream=s) + s.destroy() + + +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_graph_raises(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 1.0 + + fill.use_graph = True + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with graph=True"): + fill(qd_stream=s) + s.destroy() From c549e072779fc12f4a33c381d1f76ef6167cd0e7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:53:18 -0700 Subject: [PATCH 22/39] Fix graph+stream error guard and test Check self.use_graph instead of launch_ctx.use_graph so the error 
fires even when QD_GRAPH env var is off. Use @qd.kernel(graph=True) in the test instead of manually setting .use_graph attribute. Co-authored-by: Cursor --- python/quadrants/lang/kernel.py | 2 +- tests/python/test_streams.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index e0cdf945b5..cb337d1bc1 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -561,7 +561,7 @@ def launch_kernel( self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED - if launch_ctx.use_graph and qd_stream is not None: + if self.use_graph and qd_stream is not None: raise RuntimeError( "qd_stream is not compatible with graph=True kernels. " "See docs/source/user_guide/streams.md for details." diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 8a00024220..7f03703dac 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -263,12 +263,11 @@ def test_stream_with_graph_raises(): N = 64 x = qd.field(qd.f32, shape=(N,)) - @qd.kernel + @qd.kernel(graph=True) def fill(): for i in range(N): x[i] = 1.0 - fill.use_graph = True s = qd.create_stream() with pytest.raises(RuntimeError, match="not compatible with graph=True"): fill(qd_stream=s) From 5d284acf162364a7a1c271647388fe1111a09029 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:57:09 -0700 Subject: [PATCH 23/39] Update qd.sync() docstring and streams doc to reflect default-stream-only semantics qd.sync() synchronizes the default (NULL) stream, not explicit non-blocking streams. Update the docstring and add a note to the streams user guide. 
Co-authored-by: Cursor --- docs/source/user_guide/streams.md | 1 + python/quadrants/lang/runtime_ops.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 85d4e8d12c..0f9dbf7496 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -134,4 +134,5 @@ qd_stream.destroy() - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. - **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. +- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/runtime_ops.py b/python/quadrants/lang/runtime_ops.py index 0ecd122f56..8b07cfb99a 100644 --- a/python/quadrants/lang/runtime_ops.py +++ b/python/quadrants/lang/runtime_ops.py @@ -4,8 +4,12 @@ def sync(): - """Blocks the calling thread until all the previously - launched Quadrants kernels have completed. + """Synchronizes the default stream. + + Blocks the calling thread until all work on the default GPU stream + has completed. Kernels launched on explicit streams created via + :func:`quadrants.create_stream` are **not** waited on — call + ``stream.synchronize()`` for those. 
""" impl.get_runtime().sync() From ff8056d34acecbb295d40a3a216e33b0fd0ddab8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:17:39 -0700 Subject: [PATCH 24/39] Reflow sync() docstring to 120-char line width Co-authored-by: Cursor --- python/quadrants/lang/runtime_ops.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/runtime_ops.py b/python/quadrants/lang/runtime_ops.py index 8b07cfb99a..71919e2379 100644 --- a/python/quadrants/lang/runtime_ops.py +++ b/python/quadrants/lang/runtime_ops.py @@ -6,10 +6,8 @@ def sync(): """Synchronizes the default stream. - Blocks the calling thread until all work on the default GPU stream - has completed. Kernels launched on explicit streams created via - :func:`quadrants.create_stream` are **not** waited on — call - ``stream.synchronize()`` for those. + Blocks the calling thread until all work on the default GPU stream has completed. Kernels launched on explicit + streams created via :func:`quadrants.create_stream` are **not** waited on — call ``stream.synchronize()`` for those. """ impl.get_runtime().sync() From 360adc8fad4e9709f51016fd131686f41679c64e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:28:58 -0700 Subject: [PATCH 25/39] Reject qd_stream on autodiff kernels Streams are not compatible with reverse-mode or forward-mode differentiation. The adstack sizer and Tape replay paths assume the default stream; rather than fixing every race, block the combination at the Python entry point with a clear error message. 
Co-authored-by: Cursor --- docs/source/user_guide/streams.md | 2 +- python/quadrants/lang/kernel.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0f9dbf7496..b4b70b774b 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,6 +133,6 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. +- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. - **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index eecf92631a..0b45a5816b 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -664,6 +664,11 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) + if qd_stream is not None and self.autodiff_mode != _NONE: + raise RuntimeError( + "qd_stream is not compatible with autodiff kernels. Streams cannot be used with " + "reverse-mode or forward-mode differentiation." + ) if qd_stream is not None and self.runtime.target_tape: raise RuntimeError( "qd_stream is not compatible with autograd Tape. 
Launch the kernel outside the Tape " From e20fe99687dd0f2cfb78a7895414bd481d6f7fa6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:29:55 -0700 Subject: [PATCH 26/39] Revert adstack sizer stream_synchronize Autodiff+streams is now blocked at the Python level, so the adstack code path never runs on a non-default stream. Remove the unnecessary stream_synchronize we added in publish_adstack_metadata. Co-authored-by: Cursor --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 214c12de11..8326335dfb 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -941,12 +941,6 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, runtime_context_ptr_for_sizer, bytecode_dev_ptr); - // The sizer kernel runs on active_stream; drain it before reading the stride on the host. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().stream_synchronize(CUDAContext::get_instance().get_stream()); - } -#endif uint64_t stride_u64 = 0; copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); stride = static_cast(stride_u64); From e3c5f6f59461392be9b16ea76550b278649a8899 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:40:06 -0700 Subject: [PATCH 27/39] Reset llvm_runtime_executor.cpp to upstream Our branch had a stale copy of publish_adstack_metadata and ensure_adstack_heap that conflicted with upstream's refactor into ensure_adstack_heap_float / ensure_adstack_heap_int. Since autodiff is now blocked with streams at the Python level, we have no changes to make in this file. 
Co-authored-by: Cursor --- .../runtime/llvm/llvm_runtime_executor.cpp | 425 ------------------ 1 file changed, 425 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 8326335dfb..658c139c0f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -606,431 +606,6 @@ void *LlvmRuntimeExecutor::get_runtime_temporaries_device_ptr() { return runtime_temporaries_cache_; } -// Publish the per-task adstack metadata into the LLVMRuntime struct and size the heap. The codegen path loads -// stride / offset / max_size from these fields at every `AdStack*` site (see `ensure_ad_stack_metadata_llvm` in -// codegen_llvm.cpp), so we must write them before every launch even for tasks where the compile-time and -// launch-time bounds agree. `evaluate_adstack_size_expr` is called only when the symbolic tree is available; the -// offline cache does not currently serialize `SizeExpr`, so cache hits fall back to `max_size_compile_time`. -std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInfo &ad_stack, - std::size_t num_threads, - LaunchContextBuilder *ctx, - void *device_runtime_context_ptr) { - const auto n_stacks = ad_stack.allocas.size(); - if (n_stacks == 0 || num_threads == 0) { - return 0; - } - auto align_up_8 = [](std::size_t n) -> std::size_t { return (n + 7u) & ~std::size_t{7u}; }; - // Allocate / grow the two device-side metadata arrays. Capacity is in u64 entries, kept at or above n_stacks. - // On GPU these buffers are written exclusively by the device-side sizer kernel (`runtime_eval_adstack_size_expr`); - // on CPU the host evaluator writes them directly via `std::memcpy`. Either way the pointers published into - // `runtime->adstack_offsets` / `adstack_max_sizes` stay stable across launches unless we grow here. 
- auto grow_to = [&](DeviceAllocationUnique &alloc, std::size_t capacity_u64) { - Device::AllocParams params{}; - params.size = capacity_u64 * sizeof(uint64_t); - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, "Failed to allocate {} bytes for adstack metadata array (err: {})", - params.size, int(res)); - alloc = std::make_unique(std::move(new_alloc)); - }; - if (n_stacks > adstack_metadata_capacity_) { - std::size_t new_cap = std::max(n_stacks, 2 * adstack_metadata_capacity_); - grow_to(adstack_offsets_alloc_, new_cap); - grow_to(adstack_max_sizes_alloc_, new_cap); - adstack_metadata_capacity_ = new_cap; - } - void *offsets_dev_ptr = get_device_alloc_info_ptr(*adstack_offsets_alloc_); - void *max_sizes_dev_ptr = get_device_alloc_info_ptr(*adstack_max_sizes_alloc_); - - auto copy_h2d = [&](void *dst, const void *src, std::size_t bytes) { - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_host_to_device(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_host_to_device(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - std::memcpy(dst, src, bytes); - } - }; - auto copy_d2h = [&](void *dst, const void *src, std::size_t bytes) { - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_device_to_host(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_device_to_host(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - std::memcpy(dst, src, bytes); - } - }; - - 
// Cache the runtime-field addresses on the first call; then publish the metadata-array pointers into the - // runtime struct. The stride field is written by the sizer on GPU and by this function on CPU, so we cache the - // address either way. - if (runtime_adstack_stride_field_ptr_ == nullptr) { - auto *const runtime_jit = get_runtime_jit_module(); - runtime_jit->call("runtime_get_adstack_metadata_field_ptrs", llvm_runtime_); - runtime_adstack_stride_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id, result_buffer_cache_)); - runtime_adstack_offsets_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 1, result_buffer_cache_)); - runtime_adstack_max_sizes_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 2, result_buffer_cache_)); - } - copy_h2d(runtime_adstack_offsets_field_ptr_, &offsets_dev_ptr, sizeof(void *)); - copy_h2d(runtime_adstack_max_sizes_field_ptr_, &max_sizes_dev_ptr, sizeof(void *)); - - std::size_t stride = 0; - const bool is_gpu_llvm = (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu); - - // Host-eval fast path. The on-device sizer kernel exists to handle one specific leaf, `ExternalTensorRead`, - // whose ndarray data lives in GPU-private memory (`cudaMalloc` / `hipMalloc`, no UVA fallback) and thus - // cannot be touched from the host. Every other SizeExpr leaf - `Const`, `BoundVariable`, - // `ExternalTensorShape`, `FieldLoad` - is host-resolvable through the existing `evaluate_adstack_size_expr` - // path, so when the kernel's SizeExprs are all `ExternalTensorRead`-free we can skip the encode + bytecode - // h2d + sizer-kernel launch + d2h-stride pipeline entirely and write the metadata directly via `copy_h2d`. 
- // On CUDA the saved `cuMemcpyDtoH` for the per-launch stride readback is the dominant cost: every reverse- - // mode kernel launch in a 100-substep test paid one such synchronous DtoH each, and that compound stall - // accounted for the bulk of the GPU launch overhead under adstack mode. The condition is computed once per - // launch by scanning each stack's `nodes` vector for an `ExternalTensorRead` leaf; the scan is O(total - // SizeExpr nodes), well below the cost of the cheapest h2d / d2h on any LLVM GPU backend. - bool all_size_exprs_host_resolvable = true; - for (std::size_t i = 0; i < n_stacks && all_size_exprs_host_resolvable; ++i) { - if (i >= ad_stack.size_exprs.size()) { - continue; - } - for (const auto &node : ad_stack.size_exprs[i].nodes) { - if (static_cast(node.kind) == SizeExpr::Kind::ExternalTensorRead) { - all_size_exprs_host_resolvable = false; - break; - } - } - } - const bool use_host_eval = !is_gpu_llvm || all_size_exprs_host_resolvable; - if (use_host_eval) { - // CPU + GPU-without-ExternalTensorRead path: run the host evaluator directly. On CPU we use synchronous - // `copy_h2d` (just `std::memcpy` for that arch), but on CUDA / AMDGPU we ship the same payload through - // pinned-host memory via async `cuMemcpyHtoDAsync` / `hipMemcpyHtoDAsync` so the host returns immediately - // after queueing the copies on the default stream and the subsequent main-kernel launch (also on the - // default stream) stream-orders after the copies. The synchronous `cuMemcpyHtoD_v2` path used to block - // the host on every one of the three writes we issue per launch; with thousands of reverse-mode launches - // per `test_differentiable_rigid` run, those serial host stalls were a measurable fraction of wallclock. - // `FieldLoad` is serviced by `SNodeRwAccessorsBank` regardless of arch. 
- // Guard `program_impl_->program` lookups against the C++-only-tests setup where `program_impl_` itself is null; - // the on-device branch below already does this and falls back to `max_size_compile_time`. - Program *prog = (program_impl_ != nullptr) ? program_impl_->program : nullptr; - std::vector host_max_sizes(n_stacks); - for (std::size_t i = 0; i < n_stacks; ++i) { - const SerializedSizeExpr *expr = (i < ad_stack.size_exprs.size()) ? &ad_stack.size_exprs[i] : nullptr; - int64_t v = -1; - if (expr != nullptr && !expr->nodes.empty() && prog != nullptr) { - v = evaluate_adstack_size_expr(*expr, prog, ctx); - } - if (v < 0) { - v = static_cast(ad_stack.allocas[i].max_size_compile_time); - } - host_max_sizes[i] = static_cast(std::max(v, 1)); - } - std::vector host_offsets(n_stacks); - for (std::size_t i = 0; i < n_stacks; ++i) { - host_offsets[i] = stride; - stride += align_up_8(sizeof(int64_t) + ad_stack.allocas[i].entry_size_bytes * host_max_sizes[i]); - } - uint64_t stride_u64 = static_cast(stride); - if (!is_gpu_llvm) { - copy_h2d(offsets_dev_ptr, host_offsets.data(), n_stacks * sizeof(uint64_t)); - copy_h2d(max_sizes_dev_ptr, host_max_sizes.data(), n_stacks * sizeof(uint64_t)); - copy_h2d(runtime_adstack_stride_field_ptr_, &stride_u64, sizeof(uint64_t)); - } else { - // Three-block payload packed into the pinned-host scratch as `[stride_u64, offsets[n_stacks], - // max_sizes[n_stacks]]`. Three async DMAs land on the three target device addresses (the runtime - // struct's stride field, the offsets storage buffer, the max_sizes storage buffer) sourced from - // the corresponding offsets within the pinned scratch. The driver's H2D DMA engine reads from the - // pinned bytes at execution time, so we must not overwrite the scratch before all three copies - // have completed - hence the per-launch `event_record` after the last copy and the - // `event_synchronize` at the top of the next launch. 
The wait is typically a no-op because a few - // microseconds of small copies finish well before the host returns, dispatches the main kernel, - // and re-enters this function on the next launch. - const std::size_t header_bytes = sizeof(uint64_t); - const std::size_t array_bytes = n_stacks * sizeof(uint64_t); - const std::size_t total_bytes = header_bytes + 2 * array_bytes; - - auto wait_pending = [this]() { - if (!pinned_metadata_event_pending_) { - return; - } -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().event_synchronize(pinned_metadata_event_); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().event_synchronize(pinned_metadata_event_); - } -#endif - pinned_metadata_event_pending_ = false; - }; - - // Grow / first-allocate the pinned host scratch and the per-launch completion event. Doubling growth - // means the pinned alloc / free traffic is amortised to O(log peak_total_bytes) across a run. - if (total_bytes > pinned_metadata_scratch_capacity_) { - wait_pending(); - if (pinned_metadata_scratch_ != nullptr) { -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().mem_free_host(pinned_metadata_scratch_); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().mem_free_host(pinned_metadata_scratch_); - } -#endif - pinned_metadata_scratch_ = nullptr; - } - std::size_t new_capacity = std::max(total_bytes, 2 * pinned_metadata_scratch_capacity_); -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().mem_alloc_host(&pinned_metadata_scratch_, new_capacity); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - // `hipHostMallocDefault == 0`. Coherent / portable / write-combined flags are intentionally not set; - // the workload is small payloads written linearly by the host and DMA-read by the GPU once. 
- AMDGPUDriver::get_instance().mem_alloc_host(&pinned_metadata_scratch_, new_capacity, 0u); - } -#endif - pinned_metadata_scratch_capacity_ = new_capacity; - } - if (pinned_metadata_event_ == nullptr) { - // `cuEventCreate` flag `0` (CU_EVENT_DEFAULT) means timing-enabled, which the driver costs us nothing - // to set up here and lets future profilers attach without re-creating the event. `hipEventCreateWithFlags` - // takes the same encoding. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().event_create(&pinned_metadata_event_, 0u); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().event_create(&pinned_metadata_event_, 0u); - } -#endif - } - // Block until any in-flight copies from the previous launch have finished pulling from the pinned scratch - // before we overwrite it. In steady state this is a no-op because the small DMAs finish well before the - // host loops back here; the wait exists only to defend against an unusual interleaving where the GPU - // queue is backlogged and the next launch enters this function before the previous launch's last copy - // has been consumed. - wait_pending(); - - auto *pinned = static_cast(pinned_metadata_scratch_); - pinned[0] = stride_u64; - std::memcpy(pinned + 1, host_offsets.data(), array_bytes); - std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); - - // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. 
the default stream), so the copies match that. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - void *active_stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, - active_stream); - CUDADriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, active_stream); - CUDADriver::get_instance().memcpy_host_to_device_async(max_sizes_dev_ptr, pinned + 1 + n_stacks, array_bytes, - active_stream); - CUDADriver::get_instance().event_record(pinned_metadata_event_, active_stream); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. - AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, - header_bytes, active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, - active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(max_sizes_dev_ptr, pinned + 1 + n_stacks, array_bytes, - active_stream); - AMDGPUDriver::get_instance().event_record(pinned_metadata_event_, active_stream); - } -#endif - pinned_metadata_event_pending_ = true; - } - } else { - // GPU (CUDA / AMDGPU): encode the SizeExpr trees into device bytecode, upload, launch the sizer runtime - // function, read back just the computed stride. The sizer kernel writes `adstack_max_sizes[]`, - // `adstack_offsets[]`, and `adstack_per_thread_stride` directly into the runtime struct and the metadata - // arrays above - no further host-writes to those fields are needed this launch. - // - // Why this architecture rather than host-eval: on CUDA / AMDGPU the ndarray data lives in GPU-private memory - // (plain `cudaMalloc` / `hipMalloc`, not managed / unified), so the host evaluator's `ExternalTensorRead` - // deref reads garbage. 
Moving the interpreter on-device keeps the pointer semantics intact - it reads the - // data pointer out of `ctx->arg_buffer` (which the kernel will read too) and dereferences it where the - // memory lives, with no migration / readback of the ndarray payload itself. - std::vector bytecode; - if (program_impl_ != nullptr && program_impl_->program != nullptr) { - bytecode = encode_adstack_size_expr_device_bytecode(ad_stack, program_impl_->program, ctx); - } else { - // No program attached (rare: C++-only tests that construct Program without a full runtime). Fall through - // to compile-time bounds by emitting an empty-tree bytecode - the device interpreter sees - // `root_node_idx == -1` for every stack and routes to `max_size_compile_time`. - bytecode = encode_adstack_size_expr_device_bytecode(ad_stack, nullptr, ctx); - } - // Grow the scratch buffer if the bytecode outgrew the cached capacity. Amortised doubling keeps the - // allocation traffic O(log max_bytecode_bytes) across a run. - const std::size_t bytecode_bytes = bytecode.size(); - if (bytecode_bytes > adstack_sizer_bytecode_capacity_) { - std::size_t new_cap = std::max(bytecode_bytes, 2 * adstack_sizer_bytecode_capacity_); - Device::AllocParams params{}; - params.size = new_cap; - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, - "Failed to allocate {} bytes for the adstack sizer bytecode scratch buffer (err: {})", params.size, - int(res)); - adstack_sizer_bytecode_alloc_ = std::make_unique(std::move(new_alloc)); - adstack_sizer_bytecode_capacity_ = new_cap; - } - void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); - copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - - // Invoke the device interpreter. 
On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. - // - // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves - // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of - // `RuntimeContext` because HIP has no UVA fallback and the host pointer faults with `hipErrorIllegalAddress`. CUDA - // stages the device copy only when the driver + kernel do not expose HMM / system-allocated memory (queried via - // `CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`): CUDA UVA covers pinned / CUDA-managed memory only, not the plain - // `std::make_unique()` backing, so a host pointer works on HMM-capable setups but faults otherwise - // (Turing without HMM, Windows, pre-535 Linux drivers) as `CUDA_ERROR_ILLEGAL_ADDRESS` at the next DtoH sync - // `illegal memory access ... while calling memcpy_device_to_host`. When the caller passes `nullptr` (HMM-capable - // CUDA) we fall back to the host pointer; the launcher gates the allocation so HMM-equipped setups pay no staging - // cost. - auto *const runtime_jit = get_runtime_jit_module(); - void *runtime_context_ptr_for_sizer = - device_runtime_context_ptr != nullptr ? 
device_runtime_context_ptr : static_cast(&ctx->get_context()); - runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, - runtime_context_ptr_for_sizer, bytecode_dev_ptr); - - uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); - stride = static_cast(stride_u64); - } - - std::size_t needed_bytes = stride * num_threads; - ensure_adstack_heap(needed_bytes); - return needed_bytes; -} - -void LlvmRuntimeExecutor::ensure_adstack_heap(std::size_t needed_bytes) { - if (needed_bytes == 0 || needed_bytes <= adstack_heap_size_) { - return; - } - // Amortized doubling keeps the number of re-allocations across a run bounded by log(peak_size). - std::size_t new_size = std::max(needed_bytes, std::size_t(2) * adstack_heap_size_); - - Device::AllocParams params{}; - params.size = new_size; - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, - "Failed to allocate {} bytes for the adstack heap (err: {}). Consider lowering `ad_stack_size` or the " - "per-kernel reverse-mode adstack count.", - new_size, int(res)); - // `get_device_alloc_info_ptr` is the RHI-agnostic accessor that returns the raw host-visible - // pointer on CPU and the device-visible pointer on CUDA / AMDGPU (`get_memory_addr` is only - // implemented on the GPU devices, so we route through this helper instead). - void *new_ptr = get_device_alloc_info_ptr(new_alloc); - - auto new_guard = std::make_unique(std::move(new_alloc)); - - // Publish the new buffer pointer and size into the runtime struct. On CPU the runtime lives in host memory, - // so plain stores through the cached field pointers are correct. On CUDA / AMDGPU the runtime lives in device - // memory, so the host writes via the driver's host->device memcpy. 
The field-address query runs exactly once, - // on the first grow, and caches the two device pointers; every subsequent grow is just two 8-byte memcpys. - if (runtime_adstack_heap_buffer_field_ptr_ == nullptr) { - auto *const runtime_jit = get_runtime_jit_module(); - runtime_jit->call("runtime_get_adstack_heap_field_ptrs", llvm_runtime_); - runtime_adstack_heap_buffer_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id, result_buffer_cache_)); - runtime_adstack_heap_size_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 1, result_buffer_cache_)); - } - uint64 size_u64 = static_cast(new_size); - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_buffer_field_ptr_, &new_ptr, sizeof(void *)); - CUDADriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_size_field_ptr_, &size_u64, sizeof(uint64)); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_buffer_field_ptr_, &new_ptr, - sizeof(void *)); - AMDGPUDriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_size_field_ptr_, &size_u64, sizeof(uint64)); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - *reinterpret_cast(runtime_adstack_heap_buffer_field_ptr_) = new_ptr; - *reinterpret_cast(runtime_adstack_heap_size_field_ptr_) = size_u64; - } - - // Replace and release the old allocation. `DeviceAllocationGuard`'s destructor calls - // `llvm_device()->dealloc_memory`. The new slab has already been handed to `new_guard` above, so the move-assignment - // here is what destroys the *previous* guard - the new allocation is not the one being freed. Safety of the release - // depends on the backend: - // - CPU: host `std::free`. No GPU involved, always safe. 
- // - CUDA: `CudaDevice::dealloc_memory` routes through `DeviceMemoryPool::release(release_raw=true)` -> - // `cuMemFree_v2`, which synchronizes with pending device work before returning. - // - AMDGPU: `AmdgpuDevice::dealloc_memory` routes through `DeviceMemoryPool::release(release_raw=false)` -> - // `CachingAllocator::release`, which pools the allocation *without* calling `hipFree` and *without* - // synchronizing. The physical memory stays mapped, so an in-flight kernel still holding the old base pointer - // keeps reading/writing valid storage. The cross-launch safety invariant for AMDGPU comes from - // `amdgpu::KernelLauncher::launch_llvm_kernel` ending with `hipFree(context_pointer)`, which synchronizes - // with all in-flight kernels launched during that call. By the time the *next* `launch_llvm_kernel` reaches - // `ensure_adstack_heap` and can destroy the previous guard, no GPU kernel from the prior call is still - // referencing the old slab. CUDA does not need this extra hop -- the `cuMemFree_v2` in the bullet above - // already syncs -- and the CUDA launcher correspondingly does not allocate a device-side `context_pointer` - // (it passes the `RuntimeContext` by host reference). 
- adstack_heap_alloc_ = std::move(new_guard); - adstack_heap_size_ = new_size; -} - void LlvmRuntimeExecutor::preallocate_runtime_memory() { if (preallocated_runtime_memory_allocs_ != nullptr) return; From f6fee4fbd2bcf3040b9edae6970294ed9daca671 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 02:21:09 -0700 Subject: [PATCH 28/39] Add test for qd_stream + autodiff kernel error guard Co-authored-by: Cursor --- tests/python/test_streams.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 7f03703dac..db7588aaf7 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -258,6 +258,21 @@ def compute(): s.destroy() +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_autodiff_kernel_raises(): + x = qd.field(qd.f32, shape=(), needs_grad=True) + loss = qd.field(qd.f32, shape=(), needs_grad=True) + + @qd.kernel + def compute(): + loss[None] = x[None] ** 2 + + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with autodiff"): + compute.grad(qd_stream=s) + s.destroy() + + @test_utils.test(arch=[qd.cuda]) def test_stream_with_graph_raises(): N = 64 From 9fd8b7b9d718948f09f1c4335bd7127946f20d16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 04:55:19 -0700 Subject: [PATCH 29/39] Extract stream/event methods from program.cpp into program_stream.cpp Move the 9 CUDA-only stream/event Program methods into a dedicated translation unit. The CMake glob on quadrants/program/* picks up the new file automatically. 
Co-authored-by: Cursor --- quadrants/program/program.cpp | 94 ------------------------ quadrants/program/program_stream.cpp | 103 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 94 deletions(-) create mode 100644 quadrants/program/program_stream.cpp diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 5abcd255b3..8f6fdb2186 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -20,11 +20,6 @@ #include "quadrants/codegen/llvm/struct_llvm.h" #endif -#ifdef QD_WITH_CUDA -#include "quadrants/rhi/cuda/cuda_driver.h" -#include "quadrants/rhi/cuda/cuda_context.h" -#endif - #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -494,93 +489,4 @@ void Program::enqueue_compute_op_lambda(std::functionenqueue_compute_op_lambda(op, image_refs); } -uint64 Program::stream_create() { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - void *stream = nullptr; - CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); - return reinterpret_cast(stream); - } -#endif - return 0; -} - -void Program::stream_destroy(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::stream_synchronize(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::set_current_cuda_stream(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - 
CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); - } -#endif -} - -uint64 Program::event_create() { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - void *event = nullptr; - CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); - return reinterpret_cast(event); - } -#endif - return 0; -} - -void Program::event_destroy(uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); - } -#endif -} - -void Program::event_record(uint64 event_handle, uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::event_synchronize(uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); - } -#endif -} - -void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); - } -#endif -} - } // namespace quadrants::lang diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp new file mode 100644 index 0000000000..b20252ddbc --- /dev/null +++ b/quadrants/program/program_stream.cpp @@ -0,0 +1,103 @@ +// Stream and event operations for the Program class. 
+// Extracted from program.cpp to keep backend-specific GPU stream/event +// lifecycle code separate from the core Program logic. + +#include "program.h" + +#ifdef QD_WITH_CUDA +#include "quadrants/rhi/cuda/cuda_driver.h" +#include "quadrants/rhi/cuda/cuda_context.h" +#endif + +namespace quadrants::lang { + +uint64 Program::stream_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + void *stream = nullptr; + CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); + return reinterpret_cast(stream); + } +#endif + return 0; +} + +void Program::stream_destroy(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::stream_synchronize(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::set_current_cuda_stream(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); + } +#endif +} + +uint64 Program::event_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + void *event = nullptr; + CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); + return reinterpret_cast(event); + } +#endif + return 0; +} + +void Program::event_destroy(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + 
CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); + } +#endif +} + +void Program::event_record(uint64 event_handle, uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::event_synchronize(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); + } +#endif +} + +void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif +} + +} // namespace quadrants::lang From 9e6f865cfb29b78e5c99705b84e3a6a1bc80bc86 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:26:10 -0700 Subject: [PATCH 30/39] Introduce StreamManager delegate class for stream/event ops Move the CUDA stream/event logic into a StreamManager class (program_stream.h/.cpp). Program keeps its public API unchanged and delegates to stream_manager_ internally, so the pybind layer and Python code need no changes. 
Co-authored-by: Cursor --- quadrants/program/program.cpp | 1 + quadrants/program/program.h | 2 + quadrants/program/program_stream.cpp | 77 ++++++++++++++++++++-------- quadrants/program/program_stream.h | 31 +++++++++++ 4 files changed, 90 insertions(+), 21 deletions(-) create mode 100644 quadrants/program/program_stream.h diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 8f6fdb2186..ff9901add5 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -60,6 +60,7 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { config = default_compile_config; config.arch = desired_arch; config.fit(); + stream_manager_ = StreamManager(config.arch); profiler = make_profiler(config.arch, config.kernel_profiler); if (arch_uses_llvm(config.arch)) { diff --git a/quadrants/program/program.h b/quadrants/program/program.h index fe2f30ca74..7fb6019026 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -21,6 +21,7 @@ #include "quadrants/program/kernel_profiler.h" #include "quadrants/program/snode_expr_utils.h" #include "quadrants/program/snode_rw_accessors_bank.h" +#include "quadrants/program/program_stream.h" #include "quadrants/program/context.h" #include "quadrants/struct/snode_tree.h" #include "quadrants/system/threading.h" @@ -338,6 +339,7 @@ class QD_DLL_EXPORT Program { private: CompileConfig compile_config_; + StreamManager stream_manager_{Arch::x64}; // re-initialized in constructor after arch is known uint64 ndarray_writer_counter_{0}; uint64 ndarray_reader_counter_{0}; diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index b20252ddbc..442e0cfa8d 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -1,7 +1,6 @@ -// Stream and event operations for the Program class. -// Extracted from program.cpp to keep backend-specific GPU stream/event -// lifecycle code separate from the core Program logic. 
+// StreamManager implementation and Program delegation. +#include "program_stream.h" #include "program.h" #ifdef QD_WITH_CUDA @@ -11,9 +10,13 @@ namespace quadrants::lang { -uint64 Program::stream_create() { +// --------------------------------------------------------------------------- +// StreamManager +// --------------------------------------------------------------------------- + +uint64 StreamManager::create_stream() { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); void *stream = nullptr; CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); @@ -23,36 +26,36 @@ uint64 Program::stream_create() { return 0; } -void Program::stream_destroy(uint64 stream_handle) { +void StreamManager::destroy_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda && stream_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } -void Program::stream_synchronize(uint64 stream_handle) { +void StreamManager::synchronize_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda && stream_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } -void Program::set_current_cuda_stream(uint64 stream_handle) { +void StreamManager::set_current_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } -uint64 Program::event_create() { +uint64 StreamManager::create_event() { #ifdef QD_WITH_CUDA - if (compile_config().arch == 
Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); void *event = nullptr; CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); @@ -62,18 +65,18 @@ uint64 Program::event_create() { return 0; } -void Program::event_destroy(uint64 event_handle) { +void StreamManager::destroy_event(uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } -void Program::event_record(uint64 event_handle, uint64 stream_handle) { +void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -81,18 +84,18 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif } -void Program::event_synchronize(uint64 event_handle) { +void StreamManager::synchronize_event(uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } -void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), 
reinterpret_cast(event_handle), 0 /*flags*/); @@ -100,4 +103,36 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif } +// --------------------------------------------------------------------------- +// Program delegation — keeps the pybind / Python API unchanged. +// --------------------------------------------------------------------------- + +uint64 Program::stream_create() { + return stream_manager_.create_stream(); +} +void Program::stream_destroy(uint64 h) { + stream_manager_.destroy_stream(h); +} +void Program::stream_synchronize(uint64 h) { + stream_manager_.synchronize_stream(h); +} +void Program::set_current_cuda_stream(uint64 h) { + stream_manager_.set_current_stream(h); +} +uint64 Program::event_create() { + return stream_manager_.create_event(); +} +void Program::event_destroy(uint64 h) { + stream_manager_.destroy_event(h); +} +void Program::event_record(uint64 eh, uint64 sh) { + stream_manager_.record_event(eh, sh); +} +void Program::event_synchronize(uint64 h) { + stream_manager_.synchronize_event(h); +} +void Program::stream_wait_event(uint64 sh, uint64 eh) { + stream_manager_.stream_wait_event(sh, eh); +} + } // namespace quadrants::lang diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h new file mode 100644 index 0000000000..ae6b7221d5 --- /dev/null +++ b/quadrants/program/program_stream.h @@ -0,0 +1,31 @@ +// StreamManager — manages CUDA stream and event lifecycle. +// Isolated from Program so that backend-specific GPU plumbing +// does not pollute the core Program interface. 
+ +#pragma once + +#include "quadrants/common/core.h" +#include "quadrants/util/lang_util.h" + +namespace quadrants::lang { + +class StreamManager { + public: + explicit StreamManager(Arch arch) : arch_(arch) {} + + uint64 create_stream(); + void destroy_stream(uint64 stream_handle); + void synchronize_stream(uint64 stream_handle); + void set_current_stream(uint64 stream_handle); + + uint64 create_event(); + void destroy_event(uint64 event_handle); + void record_event(uint64 event_handle, uint64 stream_handle); + void synchronize_event(uint64 event_handle); + void stream_wait_event(uint64 stream_handle, uint64 event_handle); + + private: + Arch arch_; +}; + +} // namespace quadrants::lang From 84ba5b05b7d9d502eccace8f52e88ea9df0ccbc6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:34:12 -0700 Subject: [PATCH 31/39] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From b1b4ee60b298aa3e7ea93903c3895dd5a59cf155 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:36:58 -0700 Subject: [PATCH 32/39] Remove Program wrapper methods, bind StreamManager directly via pybind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add stream_manager() accessor to Program. Update export_lang.cpp to call StreamManager methods through lambdas. 
Delete the 9 one-line delegation methods from Program — the declarations in program.h and definitions in program_stream.cpp are both gone. Co-authored-by: Cursor --- quadrants/program/program.h | 12 +++------- quadrants/program/program_stream.cpp | 33 ---------------------------- quadrants/python/export_lang.cpp | 18 +++++++-------- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 7fb6019026..600533f1cf 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -320,15 +320,9 @@ class QD_DLL_EXPORT Program { return ndarrays_.size(); } - uint64 stream_create(); - void stream_destroy(uint64 stream_handle); - void stream_synchronize(uint64 stream_handle); - void set_current_cuda_stream(uint64 stream_handle); - uint64 event_create(); - void event_destroy(uint64 event_handle); - void event_record(uint64 event_handle, uint64 stream_handle); - void event_synchronize(uint64 event_handle); - void stream_wait_event(uint64 stream_handle, uint64 event_handle); + StreamManager &stream_manager() { + return stream_manager_; + } // TODO(zhanlue): Move these members and corresponding interfaces to // ProgramImpl Ideally, Program should serve as a pure interface class and all diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 442e0cfa8d..b1c2429dd6 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -1,7 +1,6 @@ // StreamManager implementation and Program delegation. #include "program_stream.h" -#include "program.h" #ifdef QD_WITH_CUDA #include "quadrants/rhi/cuda/cuda_driver.h" @@ -103,36 +102,4 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) #endif } -// --------------------------------------------------------------------------- -// Program delegation — keeps the pybind / Python API unchanged. 
-// --------------------------------------------------------------------------- - -uint64 Program::stream_create() { - return stream_manager_.create_stream(); -} -void Program::stream_destroy(uint64 h) { - stream_manager_.destroy_stream(h); -} -void Program::stream_synchronize(uint64 h) { - stream_manager_.synchronize_stream(h); -} -void Program::set_current_cuda_stream(uint64 h) { - stream_manager_.set_current_stream(h); -} -uint64 Program::event_create() { - return stream_manager_.create_event(); -} -void Program::event_destroy(uint64 h) { - stream_manager_.destroy_event(h); -} -void Program::event_record(uint64 eh, uint64 sh) { - stream_manager_.record_event(eh, sh); -} -void Program::event_synchronize(uint64 h) { - stream_manager_.synchronize_event(h); -} -void Program::stream_wait_event(uint64 sh, uint64 eh) { - stream_manager_.stream_wait_event(sh, eh); -} - } // namespace quadrants::lang diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 8cfdd78b5a..c46d40ac10 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -406,15 +406,15 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("stream_create", &Program::stream_create) - .def("stream_destroy", &Program::stream_destroy) - .def("stream_synchronize", &Program::stream_synchronize) - .def("set_current_cuda_stream", &Program::set_current_cuda_stream) - .def("event_create", &Program::event_create) - .def("event_destroy", &Program::event_destroy) - .def("event_record", &Program::event_record) - .def("event_synchronize", &Program::event_synchronize) - .def("stream_wait_event", &Program::stream_wait_event) + .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + .def("stream_destroy", [](Program *p, uint64 h) { 
p->stream_manager().destroy_stream(h); }) + .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) + .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) + .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) + .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) + .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) + .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) + .def("stream_wait_event", [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }) .def("get_graph_cache_size", &Program::get_graph_cache_size) .def("get_graph_cache_used_on_last_call", &Program::get_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", &Program::get_num_offloaded_tasks_on_last_call) From 7e102672eab2ff2713c26cd90445566b81d57a53 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:18:07 -0700 Subject: [PATCH 33/39] Reflow comment in program_stream.h to 120-char width Co-authored-by: Cursor --- quadrants/program/program_stream.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index 54a8e88d0b..69265c26b3 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -1,6 +1,5 @@ -// StreamManager — manages CUDA stream and event lifecycle. -// Isolated from Program so that backend-specific GPU plumbing -// does not pollute the core Program interface. +// StreamManager — manages CUDA stream and event lifecycle, isolated from Program so that backend-specific GPU +// plumbing does not pollute the core Program interface. 
#pragma once From 614c742cd9cfb0195ae32dedee09d4d7fd374949 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:19:08 -0700 Subject: [PATCH 34/39] Use captured prog_ref for all Stream/Event operations All methods on Stream and Event now resolve the Program through the captured weakref first, falling back to the current runtime only for externally-wrapped handles. Fixes a bug where destroy/synchronize/ record/wait would call into the wrong Program after qd.reset(). Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 063a2aeafc..85e7c1e86b 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -22,10 +22,16 @@ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): def handle(self) -> int: return self._handle + def _prog(self): + if self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + return prog + return impl.get_runtime().prog + def synchronize(self): """Block until all operations on this stream complete.""" - prog = impl.get_runtime().prog - prog.stream_synchronize(self._handle) + self._prog().stream_synchronize(self._handle) def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. @@ -33,8 +39,7 @@ def destroy(self): No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = impl.get_runtime().prog - prog.stream_destroy(self._handle) + self._prog().stream_destroy(self._handle) self._handle = 0 def __del__(self): @@ -69,22 +74,26 @@ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): def handle(self) -> int: return self._handle + def _prog(self): + if self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + return prog + return impl.get_runtime().prog + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" - prog = impl.get_runtime().prog stream_handle = qd_stream.handle if qd_stream is not None else 0 - prog.event_record(self._handle, stream_handle) + self._prog().event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): """Make a stream wait for this event. None means the default stream.""" - prog = impl.get_runtime().prog stream_handle = qd_stream.handle if qd_stream is not None else 0 - prog.stream_wait_event(stream_handle, self._handle) + self._prog().stream_wait_event(stream_handle, self._handle) def synchronize(self): """Block the host until this event has been reached.""" - prog = impl.get_runtime().prog - prog.event_synchronize(self._handle) + self._prog().event_synchronize(self._handle) def destroy(self): """Explicitly destroy the event. Safe to call multiple times. @@ -92,8 +101,7 @@ def destroy(self): No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = impl.get_runtime().prog - prog.event_destroy(self._handle) + self._prog().event_destroy(self._handle) self._handle = 0 def __del__(self): From 3dad35ad4ad58bd92d034e0eb01a65a92705c897 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:31:35 -0700 Subject: [PATCH 35/39] Fix stale handle safety in Stream/Event after qd.reset() When _prog_ref is set but the weakref has expired (Program destroyed), _prog() now returns None instead of falling back to the current runtime. Active operations (synchronize, record, wait) raise RuntimeError; destroy silently no-ops and zeroes the handle. Also allow synchronize_stream(0) to sync the default stream in CUDA, matching cuStreamSynchronize(nullptr) semantics. Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 41 ++++++++++++++++++---------- quadrants/program/program_stream.cpp | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 85e7c1e86b..5898cb434e 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -23,23 +23,28 @@ def handle(self) -> int: return self._handle def _prog(self): + """Resolve the owning Program, or None if the owner was collected.""" if self._prog_ref is not None: - prog = self._prog_ref() - if prog is not None: - return prog + return self._prog_ref() return impl.get_runtime().prog def synchronize(self): """Block until all operations on this stream complete.""" - self._prog().stream_synchronize(self._handle) + prog = self._prog() + if prog is None: + raise RuntimeError("Stream's owning Program has been destroyed (e.g. after qd.reset())") + prog.stream_synchronize(self._handle) def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. - No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). 
+ No-op if the owning Program has already been collected, or for streams wrapping external handles + (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - self._prog().stream_destroy(self._handle) + prog = self._prog() + if prog is not None: + prog.stream_destroy(self._handle) self._handle = 0 def __del__(self): @@ -75,33 +80,41 @@ def handle(self) -> int: return self._handle def _prog(self): + """Resolve the owning Program, or None if the owner was collected.""" if self._prog_ref is not None: - prog = self._prog_ref() - if prog is not None: - return prog + return self._prog_ref() return impl.get_runtime().prog + def _require_prog(self): + prog = self._prog() + if prog is None: + raise RuntimeError("Event's owning Program has been destroyed (e.g. after qd.reset())") + return prog + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" stream_handle = qd_stream.handle if qd_stream is not None else 0 - self._prog().event_record(self._handle, stream_handle) + self._require_prog().event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): """Make a stream wait for this event. None means the default stream.""" stream_handle = qd_stream.handle if qd_stream is not None else 0 - self._prog().stream_wait_event(stream_handle, self._handle) + self._require_prog().stream_wait_event(stream_handle, self._handle) def synchronize(self): """Block the host until this event has been reached.""" - self._prog().event_synchronize(self._handle) + self._require_prog().event_synchronize(self._handle) def destroy(self): """Explicitly destroy the event. Safe to call multiple times. - No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). + No-op if the owning Program has already been collected, or for events wrapping external handles + (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - self._prog().event_destroy(self._handle) + prog = self._prog() + if prog is not None: + prog.event_destroy(self._handle) self._handle = 0 def __del__(self): diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index b1c2429dd6..8a7431532a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -36,7 +36,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { void StreamManager::synchronize_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (arch_ == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } From bebc9040869cdbcdcf8094b80fb4c849f28f16ce Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:14:23 -0700 Subject: [PATCH 36/39] Extract stream/event pybind bindings into export_stream.cpp Move the 9 stream/event .def() bindings from export_lang.cpp into a new export_stream.cpp, following the existing export_math/export_misc pattern. Satisfies the feature-factorization check for the 1225-line export_lang.cpp. 
Co-authored-by: Cursor --- quadrants/python/export.h | 6 ++++++ quadrants/python/export_lang.cpp | 14 +++----------- quadrants/python/export_stream.cpp | 26 ++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 quadrants/python/export_stream.cpp diff --git a/quadrants/python/export.h b/quadrants/python/export.h index 331c35b4b6..92736daedf 100644 --- a/quadrants/python/export.h +++ b/quadrants/python/export.h @@ -21,6 +21,10 @@ #include "quadrants/common/core.h" +namespace quadrants::lang { +class Program; +} // namespace quadrants::lang + namespace quadrants { namespace py = pybind11; @@ -33,4 +37,6 @@ void export_math(py::module &m); void export_misc(py::module &m); +void export_stream(py::module &m, py::class_ &program_class); + } // namespace quadrants diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index c46d40ac10..b3dc79bef5 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -314,8 +314,8 @@ void export_lang(py::module &m) { auto compiled_kernel_data = py::class_(m, "CompiledKernelData") .def("_debug_dump_to_string", &CompiledKernelData::debug_dump_to_string); - py::class_(m, "Program") - .def(py::init<>()) + auto program_class = py::class_(m, "Program"); + program_class.def(py::init<>()) .def( "ndarray_to_dlpack", [](Program *program, pybind11::object owner, Ndarray *ndarray, const std::vector &layout, @@ -406,20 +406,12 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) - .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) - .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) - 
.def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) - .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) - .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) - .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) - .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) - .def("stream_wait_event", [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }) .def("get_graph_cache_size", &Program::get_graph_cache_size) .def("get_graph_cache_used_on_last_call", &Program::get_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", &Program::get_num_offloaded_tasks_on_last_call) .def("get_graph_num_nodes_on_last_call", &Program::get_graph_num_nodes_on_last_call) .def("get_graph_total_builds", &Program::get_graph_total_builds); + export_stream(m, program_class); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp new file mode 100644 index 0000000000..f3f2fad525 --- /dev/null +++ b/quadrants/python/export_stream.cpp @@ -0,0 +1,26 @@ +/******************************************************************************* + Copyright (c) The Quadrants Authors (2016- ). All Rights Reserved. + The use of this software is governed by the LICENSE file. 
+*******************************************************************************/ + +#include "quadrants/python/export.h" +#include "quadrants/program/program.h" + +namespace quadrants { + +void export_stream(py::module &m, py::class_ &program_class) { + using lang::Program; + program_class + .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) + .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) + .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) + .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) + .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) + .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) + .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) + .def("stream_wait_event", + [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }); +} + +} // namespace quadrants From 3b09331daf736eb85d220bdd6760dd5e5e553bd2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:24:59 -0700 Subject: [PATCH 37/39] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- a/quadrants/python/export_stream.cpp +++ b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", 
[](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From 48c3922acac9a7959cda3fbec90aaa4cdbabbb1a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:20:34 -0700 Subject: [PATCH 38/39] Fall back to current runtime for Stream/Event destroy after reset When the owning Program has been collected (e.g. after qd.reset()), destroy() and __del__ now fall back to the current runtime's Program to free the underlying CUDA resource. This is safe because CUDAContext is a singleton, so stream/event handles remain valid across Programs. Prevents resource leaks in create/reset cycles. Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 42 ++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 5898cb434e..6187b6f9c4 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -35,21 +35,34 @@ def synchronize(self): raise RuntimeError("Stream's owning Program has been destroyed (e.g. after qd.reset())") prog.stream_synchronize(self._handle) + def _destroy_prog(self): + """Resolve a Program for resource cleanup. + + Falls back to the current runtime when the owner has been collected, which is safe because + CUDAContext is a singleton so the CUDA stream handle remains valid. + """ + prog = self._prog() + if prog is None: + try: + return impl.get_runtime().prog + except Exception: + return None + return prog + def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. 
- No-op if the owning Program has already been collected, or for streams wrapping external handles - (created via Stream(ptr) without a prog_ref). + No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - prog = self._prog() + prog = self._destroy_prog() if prog is not None: prog.stream_destroy(self._handle) self._handle = 0 def __del__(self): if self._handle != 0 and self._prog_ref is not None: - prog = self._prog_ref() + prog = self._destroy_prog() if prog is not None: try: prog.stream_destroy(self._handle) @@ -105,21 +118,34 @@ def synchronize(self): """Block the host until this event has been reached.""" self._require_prog().event_synchronize(self._handle) + def _destroy_prog(self): + """Resolve a Program for resource cleanup. + + Falls back to the current runtime when the owner has been collected, which is safe because + CUDAContext is a singleton so the CUDA event handle remains valid. + """ + prog = self._prog() + if prog is None: + try: + return impl.get_runtime().prog + except Exception: + return None + return prog + def destroy(self): """Explicitly destroy the event. Safe to call multiple times. - No-op if the owning Program has already been collected, or for events wrapping external handles - (created via Event(ptr) without a prog_ref). + No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = self._prog() + prog = self._destroy_prog() if prog is not None: prog.event_destroy(self._handle) self._handle = 0 def __del__(self): if self._handle != 0 and self._prog_ref is not None: - prog = self._prog_ref() + prog = self._destroy_prog() if prog is not None: try: prog.event_destroy(self._handle) From 44ee707afa655e728ebf452d0e4102de3e75da7f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 15:04:03 -0700 Subject: [PATCH 39/39] Reflow _destroy_prog docstrings to 120-char width Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 6187b6f9c4..e87816568c 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -38,8 +38,8 @@ def synchronize(self): def _destroy_prog(self): """Resolve a Program for resource cleanup. - Falls back to the current runtime when the owner has been collected, which is safe because - CUDAContext is a singleton so the CUDA stream handle remains valid. + Falls back to the current runtime when the owner has been collected, which is safe because CUDAContext is a + singleton so the CUDA stream handle remains valid. """ prog = self._prog() if prog is None: @@ -121,8 +121,8 @@ def synchronize(self): def _destroy_prog(self): """Resolve a Program for resource cleanup. - Falls back to the current runtime when the owner has been collected, which is safe because - CUDAContext is a singleton so the CUDA event handle remains valid. + Falls back to the current runtime when the owner has been collected, which is safe because CUDAContext is a + singleton so the CUDA event handle remains valid. """ prog = self._prog() if prog is None: