From 7bd18ca4e1a9b6e99632af2c7c62076b4195ae3d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:42:22 -0700 Subject: [PATCH 01/28] Add AMDGPU/HIP stream support and async memory operations Mirrors the CUDA stream implementation for HIP: adds stream_ member to AMDGPUContext, stream_destroy/stream_wait_event/malloc_async/ mem_free_async to HIP driver functions, and AMDGPU branches in all Program stream/event methods. Converts AMDGPU kernel launcher to use async memory operations through the active stream. CPU backend returns 0 handles (no-op). --- quadrants/program/program.cpp | 64 ++++++++++++++ quadrants/rhi/amdgpu/amdgpu_context.cpp | 4 +- quadrants/rhi/amdgpu/amdgpu_context.h | 9 ++ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 8 ++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 51 +++++------ tests/python/test_streams.py | 84 ++++++++++++++++++- 6 files changed, 191 insertions(+), 29 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..f4bb8da35b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -25,6 +25,11 @@ #include "quadrants/rhi/cuda/cuda_context.h" #endif +#ifdef QD_WITH_AMDGPU +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" +#endif + #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -493,6 +498,13 @@ uint64 Program::stream_create() { CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); return reinterpret_cast(stream); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *stream = nullptr; + AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } #endif return 0; } @@ -504,6 +516,12 @@ void Program::stream_destroy(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if 
(compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUDriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif } void Program::stream_synchronize(uint64 stream_handle) { @@ -513,6 +531,12 @@ void Program::stream_synchronize(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUDriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif } void Program::set_current_cuda_stream(uint64 stream_handle) { @@ -522,6 +546,12 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif } uint64 Program::event_create() { @@ -532,6 +562,14 @@ uint64 Program::event_create() { 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *event = nullptr; + AMDGPUDriver::get_instance().event_create(&event, + 0x02 /*hipEventDisableTiming*/); + return reinterpret_cast(event); + } #endif return 0; } @@ -543,6 +581,12 @@ void Program::event_destroy(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif } void Program::event_record(uint64 event_handle, uint64 stream_handle) { @@ -553,6 +597,13 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_record( + reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif } void 
Program::event_synchronize(uint64 event_handle) { @@ -562,6 +613,12 @@ void Program::event_synchronize(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif } void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { @@ -572,6 +629,13 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { reinterpret_cast(event_handle), 0 /*flags*/); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif } } // namespace quadrants::lang diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 22f55339ee..f940ed9a7c 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -188,7 +188,7 @@ void AMDGPUContext::launch(void *func, void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02, (void *)&pack_size, (void *)0x03}; driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, - dynamic_shared_mem_bytes, nullptr, nullptr, + dynamic_shared_mem_bytes, stream_, nullptr, reinterpret_cast(&config)); } std::free(packed_arg); @@ -197,7 +197,7 @@ void AMDGPUContext::launch(void *func, profiler_->stop(task_handle); if (debug_) { - driver_.stream_synchronize(nullptr); + driver_.stream_synchronize(stream_); } } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 9529953bf1..68e7cd7314 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,6 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; + void *stream_{nullptr}; 
std::vector kernel_arg_pointer_; public: @@ -116,6 +117,14 @@ class AMDGPUContext { return std::unique_lock(lock_); } + void set_stream(void *stream) { + stream_ = stream; + } + + void *get_stream() const { + return stream_; + } + static AMDGPUContext &get_instance(); }; diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index dbb3612c87..6063d268a9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -26,6 +26,7 @@ PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management PER_AMDGPU_FUNCTION(memcpy_host_to_device, @@ -69,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, @@ -76,6 +78,7 @@ PER_AMDGPU_FUNCTION(malloc_managed, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); PER_AMDGPU_FUNCTION(mem_free, hipFree, void *); +PER_AMDGPU_FUNCTION(mem_free_async, hipFreeAsync, void *, void *); PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *); PER_AMDGPU_FUNCTION(mem_get_attribute, hipPointerGetAttribute, @@ -121,6 +124,11 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); +PER_AMDGPU_FUNCTION(stream_wait_event, + hipStreamWaitEvent, + void *, + void *, + uint32); // Event management PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 
6ef0b0e0e5..1d8430d35e 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/amdgpu/kernel_launcher.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" #include "quadrants/program/launch_context_builder.h" namespace quadrants::lang { @@ -32,18 +33,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, transfers; std::unordered_map device_ptrs; + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; - // Here we have to guarantee the result_result_buffer isn't nullptr - // It is interesting - The code following - // L60: DeviceAllocation devalloc = - // executor->allocate_memory_on_device( call another kernel and it will result - // in - // Memory access fault by GPU node-1 (Agent handle: 0xeda5ca0) on address - // (nil). Reason: Page not present or supervisor privilege. - // if you don't allocate it. - AMDGPUDriver::get_instance().malloc( + // Must always allocate device_result_buffer (even when result_buffer_size + // is 0) to avoid memory access faults from allocate_memory_on_device below. 
+ AMDGPUDriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64))); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; @@ -86,27 +83,28 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { - // Malloc_Async and Free_Async are available after ROCm 5.4 ctx.get_context().result_buffer = (uint64 *)device_result_buffer; } char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().malloc((void **)&device_arg_buffer, - ctx.arg_buffer_size); - AMDGPUDriver::get_instance().memcpy_host_to_device( - device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size); + AMDGPUDriver::get_instance().malloc_async( + (void **)&device_arg_buffer, ctx.arg_buffer_size, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } void *context_pointer; int arg_size = sizeof(RuntimeContext *); - AMDGPUDriver::get_instance().malloc((void **)&context_pointer, - sizeof(RuntimeContext)); - AMDGPUDriver::get_instance().memcpy_host_to_device( - context_pointer, &ctx.get_context(), sizeof(RuntimeContext)); + AMDGPUDriver::get_instance().malloc_async( + (void **)&context_pointer, sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -119,13 +117,16 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free(device_arg_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, + active_stream); } if (ctx.result_buffer_size > 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host( - host_result_buffer, device_result_buffer, ctx.result_buffer_size); + AMDGPUDriver::get_instance().memcpy_device_to_host_async( + host_result_buffer, device_result_buffer, ctx.result_buffer_size, + active_stream); } if (transfers.size()) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; auto arg_id = idx.arg_id; @@ -135,8 +136,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - // Since we always allocating above then we should always free - AMDGPUDriver::get_instance().mem_free(device_result_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -8,7 +8,7 @@ from tests import test_utils -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_stream(): s = qd.create_stream() assert isinstance(s, Stream) @@ -17,7 +17,7 @@ def test_create_and_destroy_stream(): assert s.handle == 0 -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_event(): e = qd.create_event() assert isinstance(e, Event) @@ -195,3 +195,83 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def 
test_concurrent_streams_with_events(): + """Two slow kernels on separate streams run concurrently (~1s on GPU), + serial fallback on CPU/Metal.""" + SPIN_ITERS = 5_000_000 + + @qd.kernel + def slow_fill( + a: qd.types.ndarray(dtype=qd.f32, ndim=1), + lcg_state: qd.types.ndarray(dtype=qd.i32, ndim=1), + index: qd.i32, + value: qd.f32, + ): + qd.loop_config(block_dim=1) + for _ in range(1): + x = lcg_state[index] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + lcg_state[index] = x + a[index] = value + + @qd.kernel + def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): + qd.loop_config(block_dim=1) + for _ in range(1): + a[2] = a[0] + a[1] + + import time + + # Warm up JIT + a_warmup = qd.ndarray(qd.f32, shape=(3,)) + lcg_warmup = qd.ndarray(qd.i32, shape=(3,)) + slow_fill(a_warmup, lcg_warmup, 0, 0.0) + add_first_two(a_warmup) + qd.sync() + + # Serial baseline + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0) + slow_fill(a, lcg, 1, 7.0) + add_first_two(a) + qd.sync() + serial_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + # Streams + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + s1 = qd.create_stream() + s2 = qd.create_stream() + e1 = qd.create_event() + e2 = qd.create_event() + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0, qd_stream=s1) + slow_fill(a, lcg, 1, 7.0, qd_stream=s2) + e1.record(s1) + e2.record(s2) + e1.wait() + e2.wait() + add_first_two(a) + qd.sync() + stream_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + speedup = serial_time / stream_time + if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + else: + assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" + + s1.destroy() + s2.destroy() + e1.destroy() + 
e2.destroy() From 7555ec5edf0581290df8b902b5a31e6162521fe3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:03 -0700 Subject: [PATCH 02/28] Move AMDGPU mem_free_async before transfers sync to match CUDA ordering Batch the device_result_buffer free into the stream pipeline before the sync barrier, matching the CUDA kernel launcher's ordering for consistency and marginal performance improvement. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1d8430d35e..cff0f2b4a1 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -125,6 +125,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { @@ -136,8 +138,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( From c12d23e1e1426a0b538382cb5dcab489e4c09b2e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:18 -0700 Subject: [PATCH 03/28] Convert AMDGPU sync memcpy_host_to_device to async on active_stream Use memcpy_host_to_device_async for external array transfers so they are properly ordered on the active stream, matching the CUDA launcher. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index cff0f2b4a1..f772fc7b5b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -66,8 +66,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); From 1673a38761b50fb6af4767e569fbf88751bb4788 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:25 -0700 Subject: [PATCH 04/28] Document ROCm >= 5.4 requirement for hipMallocAsync/hipFreeAsync --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6063d268a9..25e33774e7 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -70,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4 PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, From 60d015bfddac7068d1d1067d8f059e9c3236447e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:35 -0700 Subject: [PATCH 05/28] Relax concurrency test threshold and log timings Lower GPU speedup 
threshold from 1.5x to 1.3x to reduce flakiness in CI under contention, and print actual timings for diagnostics. --- tests/python/test_streams.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..236578974d 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,8 +266,9 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time + print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From c4be4ffd7c77a68ed6176ce30900d1a2260dec5b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:55 -0700 Subject: [PATCH 06/28] Add handle==0 guard to AMDGPU stream_synchronize and make stream_ thread_local Mirror the CUDA fixes: guard stream_synchronize against handle==0 to avoid unintentional default stream sync, and make AMDGPUContext::stream_ thread_local for thread-safety. 
--- quadrants/program/program.cpp | 2 +- quadrants/rhi/amdgpu/amdgpu_context.cpp | 2 ++ quadrants/rhi/amdgpu/amdgpu_context.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index faac67970c..8bab1d30f7 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -532,7 +532,7 @@ void Program::stream_synchronize(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { AMDGPUDriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index f940ed9a7c..24d924ed0d 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -13,6 +13,8 @@ namespace quadrants { namespace lang { +thread_local void *AMDGPUContext::stream_ = nullptr; + AMDGPUContext::AMDGPUContext() : driver_(AMDGPUDriver::get_instance_without_context()) { dev_count_ = 0; diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 68e7cd7314..4fc7c8328b 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,7 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; - void *stream_{nullptr}; + static thread_local void *stream_; std::vector kernel_arg_pointer_; public: From b28e7c60901fdde76ff2b9ea153534f15a0050ac Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:23:15 -0700 Subject: [PATCH 07/28] Revert "Relax concurrency test threshold and log timings" This reverts commit 60d015bfddac7068d1d1067d8f059e9c3236447e. 
--- tests/python/test_streams.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 236578974d..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,9 +266,8 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time - print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From 31fffbf1730e32c200eed37e8b4a4740ddc28b50 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 19:03:53 -0700 Subject: [PATCH 08/28] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 28 +++++++------------ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 6 +--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 18 ++++++------ 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 43e8df1236..648f3291c3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -491,8 +491,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -505,8 +504,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_synchronize( - 
reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } @@ -519,8 +517,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { - AMDGPUContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -536,8 +533,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *event = nullptr; - AMDGPUDriver::get_instance().event_create(&event, - 0x02 /*hipEventDisableTiming*/); + AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); } #endif @@ -552,8 +548,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); + AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -567,9 +562,8 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -582,8 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -597,9 +590,8 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 
event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6a01c3a87a..6be39db108 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -62,11 +62,7 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, hipOccupancyMaxActiveBlocksPerMultipro // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); -PER_AMDGPU_FUNCTION(stream_wait_event, - hipStreamWaitEvent, - void *, - void *, - uint32); +PER_AMDGPU_FUNCTION(stream_wait_event, hipStreamWaitEvent, void *, void *, uint32); // Event management PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1c5c573d85..cace0821ce 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -86,16 +86,16 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device(arr_sz, (uint64 *)device_result_buffer); 
device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[grad_ptr_idx], grad_ptr, + arr_sz, active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } @@ -141,8 +141,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *context_pointer; int arg_size = sizeof(RuntimeContext *); AMDGPUDriver::get_instance().malloc_async((void **)&context_pointer, sizeof(RuntimeContext), active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), - sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -154,15 +154,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From 798f87a18139fb8799d9b1d91135b2f6b8066a8d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: 
Fri, 24 Apr 2026 04:55:29 -0700 Subject: [PATCH 09/28] Exclude flaky test_perf_dispatch_python from Metal and Vulkan The pure-Python perf_dispatch timing test is unreliable on Mac Metal and Vulkan (MoltenVK) where timing differences between implementations are too small to consistently pick the fastest one. Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..2de074ed3c 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.metal, qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From 22389690c487e1bc05da15ed213b7e2f7bb0d7ed Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:43:36 -0700 Subject: [PATCH 10/28] [Doc] Update streams doc with AMDGPU support --- docs/source/user_guide/streams.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..cd26e01d20 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -9,6 +9,7 @@ and control synchronization with events. 
| Backend | Streams | Events | Notes | |---------|---------|--------|-------| | CUDA | Yes | Yes | Full concurrent execution | +| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) | | CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | From 8efd51f116d3825d152ee67bfbb2430a5ee25d6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:04:18 -0700 Subject: [PATCH 11/28] Address review comments: fix AMDGPU stream issues - Fix stream_synchronize(nullptr) in do-while loop to sync active stream, mirroring the CUDA path (claude red) - Remove unused kernel_arg_pointer_ member from AMDGPUContext (claude yellow) - Reword misleading ROCm fallback comment to clarify it's per-device, not per-runtime-version (claude yellow) - Fix stream_create ABI: bind to hipStreamCreateWithFlags instead of hipStreamCreate to match the two-arg call signature (codex P2) --- quadrants/rhi/amdgpu/amdgpu_context.h | 1 - quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index b9fd5c403c..083406c3f9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -25,7 +25,6 @@ class AMDGPUContext { bool debug_{false}; bool supports_mem_pool_{false}; static thread_local void *stream_; - std::vector kernel_arg_pointer_; public: AMDGPUContext(); diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index b6a4d7ba3e..d91afcac00 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -16,7 +16,7 @@ 
PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management -PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management @@ -29,7 +29,7 @@ PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, voi PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); // hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// transparently fall back to the synchronous variants when unsupported. +// fall back to the synchronous variants on devices without memory-pool support. PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 5bb5e70194..d54331f237 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -71,7 +71,8 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & do { launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks, context_pointer, arg_size); counter_val = 0; - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + auto *stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().stream_synchronize(stream); AMDGPUDriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From 34e9fa6aa47672ad4a59d2d2d4e952b1aec66698 Mon Sep 17 00:00:00 2001 From: Hugh 
Perkins Date: Fri, 1 May 2026 04:14:01 -0700 Subject: [PATCH 12/28] Use HIP_STREAM_NON_BLOCKING for AMDGPU stream_create to mirror CUDA path --- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 36c27942d0..f3fdeef548 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,7 +510,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *stream = nullptr; - AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); } #endif From 162239e38cbd9ce3fcd1365181c1f3470be194d8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:35:34 -0700 Subject: [PATCH 13/28] Use active stream for AMDGPU adstack metadata copies in publish_adstack_metadata AMDGPUContext::launch now dispatches on the user stream, so the adstack H2D copies must target the same stream to maintain ordering. Mirrors the CUDA branch. --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..bc319f9c38 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -851,11 +851,10 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. 
On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. the default stream), so the copies match that. + // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. Both CUDA and AMDGPU + // fetch the active stream from their respective context singletons (configurable via `set_stream`, + // defaults to the null stream), matching the stream used by `CUDAContext::launch` / + // `AMDGPUContext::launch`. #if defined(QD_WITH_CUDA) if (config_.arch == Arch::cuda) { void *active_stream = CUDAContext::get_instance().get_stream(); @@ -869,7 +868,7 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf #endif #if defined(QD_WITH_AMDGPU) if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. + void *active_stream = AMDGPUContext::get_instance().get_stream(); AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, active_stream); AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, From 9334efd4f102def5c5458e7ccd0a99f63e80d63e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:46:42 -0700 Subject: [PATCH 14/28] Add make_current() to all AMDGPU stream/event Program methods Mirrors commit 8b3d4ed from the CUDA path: HIP uses the same primary-context-per-thread model, so calling these methods from a non-init thread requires make_current() to bind the context first. 
--- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 89972bdf6f..2c9e57e378 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,6 +510,7 @@ uint64 Program::stream_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); @@ -527,6 +528,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -541,6 +543,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -555,6 +558,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -571,6 +575,7 @@ uint64 Program::event_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); @@ -588,6 +593,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + 
AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif @@ -603,6 +609,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); } @@ -618,6 +625,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif @@ -633,6 +641,7 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); } From 1fba4f56f6a0a2a276ffb7bd23c2d8a6374fde6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:57:54 -0700 Subject: [PATCH 15/28] Use async DtoH on active_stream for AMDGPU resolve_num_threads readback Mirrors aa4a70f from the CUDA path: with non-blocking user streams, synchronous DtoH on the NULL stream has no ordering with the prep task's store on active_stream, risking stale begin/end values. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index d54331f237..bb19087586 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -25,15 +25,17 @@ std::size_t resolve_num_threads(const OffloadedTask &task, LlvmRuntimeExecutor * std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = AMDGPUContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()`, so only `static_num_threads = grid_dim * block_dim` slices can be touched From f89bde02c5497856745bc93dd73fd2825ad2d489 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:19:16 -0700 Subject: [PATCH 16/28] Sync active_stream unconditionally at end of AMDGPU launch_llvm_kernel Mirrors 5901a7fc from the CUDA path: when transfers is empty, the result-buffer DtoH and mem_free_async were left in-flight on a 
non-blocking stream with no sync before return. Also converts transfer DtoH copies to async to match CUDA. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index bb19087586..0c5b4bad05 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -211,13 +211,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - auto arg_id = idx.arg_id; - AMDGPUDriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[arg_id]); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From ef3b95b18361dce692b02e4beff5a0a496fb5ff3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:30:20 -0700 Subject: [PATCH 17/28] Use async DtoH on active_stream for sizer stride readback The sizer kernel now runs on the user stream via Context::launch, so the synchronous copy_d2h (NULL stream) can read stale stride values with non-blocking streams. Use stream-aware async DtoH + sync for both CUDA and AMDGPU, falling back to copy_d2h for other backends. 
--- .../runtime/llvm/llvm_runtime_executor.cpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index bc319f9c38..1fff73575b 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -922,9 +922,9 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - // Invoke the device interpreter. On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. + // Invoke the device interpreter. `JITModule::call` launches this as a single-thread kernel on the active + // stream (CUDA/AMDGPU both dispatch through `{CUDA,AMDGPU}Context::launch` which uses `stream_`), so the + // writes are stream-ordered before the subsequent main-kernel dispatch. // // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of @@ -943,8 +943,27 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_context_ptr_for_sizer, bytecode_dev_ptr); // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // Use async DtoH on active_stream + sync so the readback is ordered after the sizer kernel. 
uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); +#if defined(QD_WITH_AMDGPU) + if (config_.arch == Arch::amdgpu) { + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } else +#endif +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + void *active_stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + CUDADriver::get_instance().stream_synchronize(active_stream); + } else +#endif + { + copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); + } stride = static_cast(stride_u64); } From 7f0f29958c234668651c864fa999e696f4d3a895 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:51:43 -0700 Subject: [PATCH 18/28] Fix end-of-launcher sync: conditional + dealloc race on AMDGPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors 8550aa0 from the CUDA path: 1. Make stream_synchronize conditional — only sync when result_buffer or transfers need it, avoiding host-blocking on every launch. 2. Add sync between async DtoH and device memory deallocation to prevent race with non-blocking streams. Also fixes black formatting from base branch merge. 
--- python/quadrants/lang/kernel.py | 6 ++++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..766689b02d 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -650,8 +650,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." + ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 0c5b4bad05..b32e0981ea 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -214,12 +214,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); - AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle 
KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From 84806cfdfdd3b5aa366745872429892fc37c2157 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:14:18 -0700 Subject: [PATCH 19/28] Fix NULL-stream DtoH races in synchronize() and allocate_llvm_runtime_memory_jit synchronize() now drains the active user stream (if any) before the NULL stream, so fetch_result_uint64 callers (lazy field-pointer caches at three sites) read correct values when the runtime-query kernel ran on a non-blocking user stream. allocate_llvm_runtime_memory_jit: use async H2D on active_stream for the zero-stamp and sync the active stream before the DtoH readback, so the allocator kernel result is visible. --- quadrants/rhi/amdgpu/amdgpu_device.cpp | 6 ++++-- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index 68c377a73a..d127ce19a0 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -1,4 +1,5 @@ #include "quadrants/rhi/amdgpu/amdgpu_device.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" #include "quadrants/rhi/llvm/device_memory_pool.h" #include "quadrants/jit/jit_module.h" @@ -93,11 +94,12 @@ uint64_t *AmdgpuDevice::allocate_llvm_runtime_memory_jit(const LlvmRuntimeAllocP // the kernel without writing to *result. To detect that here, zero the slot first so a null readback unambiguously // means "allocation failed" and we can surface a helpful host-side message instead of letting the downstream // hipMemset trip on the stale pointer with a cryptic hipErrorInvalidValue. 
+ void *active_stream = AMDGPUContext::get_instance().get_stream(); uint64 zero = 0; - AMDGPUDriver::get_instance().memcpy_host_to_device(params.result_buffer, &zero, sizeof(uint64)); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(params.result_buffer, &zero, sizeof(uint64), active_stream); params.runtime_jit->call("runtime_memory_allocate_aligned", params.runtime, params.size, quadrants_page_size, params.result_buffer); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); uint64 *ret{nullptr}; AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, params.result_buffer, sizeof(uint64)); QD_ERROR_IF(ret == nullptr, diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 1fff73575b..390987768a 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,12 +188,20 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) + auto *active_stream = CUDAContext::get_instance().get_stream(); + if (active_stream != nullptr) { + CUDADriver::get_instance().stream_synchronize(active_stream); + } CUDADriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + if (active_stream != nullptr) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } AMDGPUDriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No AMDGPU support"); From ae1c932db2df45bdd0069e5c2a3b748a8b3d2128 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:17:21 -0700 Subject: [PATCH 20/28] Reflow comments and docstring to 120-char line width Co-authored-by: 
Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- tests/python/test_streams.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index d91afcac00..0b789cedf5 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -28,8 +28,8 @@ PER_AMDGPU_FUNCTION(memcpy_async, hipMemcpyAsync, void *, void *, std::size_t, u PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); -// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// fall back to the synchronous variants on devices without memory-pool support. +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers fall back to the synchronous variants +// on devices without memory-pool support. 
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index b32e0981ea..67befa8b66 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -105,8 +105,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx auto *active_stream = AMDGPUContext::get_instance().get_stream(); char *device_result_buffer{nullptr}; - // Must always allocate device_result_buffer (even when result_buffer_size - // is 0) to avoid memory access faults from allocate_memory_on_device below. + // Must always allocate device_result_buffer (even when result_buffer_size is 0) to avoid memory access faults + // from allocate_memory_on_device below. AMDGPUDriver::get_instance().malloc_async((void **)&device_result_buffer, std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..969d18ecf1 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -199,8 +199,7 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): @test_utils.test() def test_concurrent_streams_with_events(): - """Two slow kernels on separate streams run concurrently (~1s on GPU), - serial fallback on CPU/Metal.""" + """Two slow kernels on separate streams run concurrently (~1s on GPU), serial fallback on CPU/Metal.""" SPIN_ITERS = 5_000_000 @qd.kernel From 3ef0340bdbba610abfd400042a9617b7e0542f03 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:40:06 -0700 Subject: [PATCH 21/28] Use context/device synchronize in synchronize() to drain all streams stream_synchronize(nullptr) does not drain non-blocking user streams 
(CU_STREAM_NON_BLOCKING / HIP_STREAM_NON_BLOCKING), so qd.sync() failed to honor its "drain everything" contract. Python's finally block resets stream_ to nullptr before qd.sync() runs, making the previous active-stream check dead code for the user-facing path. Replace with cuCtxSynchronize (CUDA) / hipDeviceSynchronize (AMDGPU) which drain all streams on the device, correctly implementing the documented qd.sync() semantics. Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 3 +++ quadrants/rhi/cuda/cuda_driver_functions.inc.h | 3 +++ quadrants/runtime/llvm/llvm_runtime_executor.cpp | 14 ++++---------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 0b789cedf5..c94a7f14db 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -15,6 +15,9 @@ PER_AMDGPU_FUNCTION(context_create, hipCtxCreate, void *, int, void *); PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); +// Device synchronization +PER_AMDGPU_FUNCTION(device_synchronize, hipDeviceSynchronize); + // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 55c5e3e0b8..b4164b7c33 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -53,6 +53,9 @@ PER_CUDA_FUNCTION(kernel_get_occupancy, cuOccupancyMaxActiveBlocksPerMultiproces PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_attribute_enum, int); +// Context management +PER_CUDA_FUNCTION(context_synchronize, cuCtxSynchronize); + // Stream management 
PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 390987768a..6d631cfc2f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,21 +188,15 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) - auto *active_stream = CUDAContext::get_instance().get_stream(); - if (active_stream != nullptr) { - CUDADriver::get_instance().stream_synchronize(active_stream); - } - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().context_synchronize(); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) - auto *active_stream = AMDGPUContext::get_instance().get_stream(); - if (active_stream != nullptr) { - AMDGPUDriver::get_instance().stream_synchronize(active_stream); - } - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUContext::get_instance().make_current(); + AMDGPUDriver::get_instance().device_synchronize(); #else QD_ERROR("No AMDGPU support"); #endif From 3a81a46abcd5a53eea40df89e7283b4516479667 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:32:15 -0700 Subject: [PATCH 22/28] Use synchronous mem_free in dealloc_memory pool branch mem_free_async on the NULL stream does not sync with non-blocking user streams, so a Python ndarray dropped while a kernel is still in flight could return its slab to the mempool prematurely. Using synchronous mem_free matches pre-stream-rewire behavior and implicitly waits for all pending work on the device. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index d127ce19a0..280cd9f7e1 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -125,7 +125,7 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { } QD_ASSERT(!info.is_imported); if (info.use_memory_pool) { - AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr); + AMDGPUDriver::get_instance().mem_free(info.ptr); } else if (info.use_cached) { DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/) .release(info.size, (uint64_t *)info.ptr, false); From 3499bbcccef6f174cbc15649b0dcbd00eaf5c990 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:04:44 -0700 Subject: [PATCH 23/28] Thread active_stream through AMDGPU profiler event_record and sync Profiler events were hardcoded to the NULL stream while kernels now run on user streams; with HIP_STREAM_NON_BLOCKING both events signal immediately on the empty NULL stream, yielding ~0 ms timings. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_profiler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp index 731d536bca..e963f7df20 100644 --- a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp @@ -59,8 +59,9 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - AMDGPUDriver::get_instance().event_record(handle, 0); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record(handle, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); // get elapsed time and destroy events auto record = event_toolkit_->get_current_event_record(); @@ -154,7 +155,8 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: AMDGPUDriver::get_instance().event_create(&(record.start_event), HIP_EVENT_DEFAULT); AMDGPUDriver::get_instance().event_create(&(record.stop_event), HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record((record.start_event), 0); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record((record.start_event), active_stream); event_records_.push_back(record); if (!base_event_) { @@ -163,7 +165,7 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: for (int i = 0; i < n_iters; i++) { void *e; AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record(e, 0); + AMDGPUDriver::get_instance().event_record(e, active_stream); AMDGPUDriver::get_instance().event_synchronize(e); auto final_t = Time::get_time(); if (i == n_iters - 1) { From 6e49c52d13f426dcac3c14b5b839059db2cb5839 Mon Sep 17 00:00:00 2001 From: 
Hugh Perkins Date: Sat, 2 May 2026 03:12:15 -0700 Subject: [PATCH 24/28] Restore context_pointer free comment in AMDGPU kernel launcher The comment explains a non-obvious race condition: context_pointer must be freed directly (now via mem_free_async on active_stream) rather than through AMDGPUContext's deferred free list, because that list is drained by LlvmRuntimeExecutor::synchronize which can be called mid-launch. Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index ab34003cbd..42db3934dd 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -288,6 +288,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } else if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); } + // Free the per-launch `RuntimeContext` on the active stream rather than through `AMDGPUContext`'s deferred free + // list. The deferred list is drained by `LlvmRuntimeExecutor::synchronize`, which is also called from + // `fetch_result_uint64` during `ensure_adstack_heap`'s field-pointer query -- that path would free + // `context_pointer` mid-launch, and HIP could recycle the address for the adstack heap allocated right after, + // clobbering the `RuntimeContext` the next task still reads from. 
AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); } From 1c81322cbe0e418a6deaa765d877a505d29ced16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:29:18 -0700 Subject: [PATCH 25/28] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From d3317f5cf00e4955095edefdeab68227426243c5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 06:01:11 -0700 Subject: [PATCH 26/28] Fix AMDGPU branches in StreamManager: use arch_ member instead of compile_config() The base branch refactored stream/event methods from Program:: to StreamManager::, which stores the arch in arch_. Our AMDGPU branches still referenced compile_config().arch, but compile_config() is a Program method and is not available from StreamManager.
Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 88288cc313..b4adc0226a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -28,7 +28,7 @@ uint64 StreamManager::create_stream() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); @@ -46,7 +46,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } @@ -76,7 +76,7 @@ void StreamManager::set_current_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } @@ -93,7 +93,7 @@ uint64 StreamManager::create_event() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 
/*hipEventDisableTiming*/); @@ -111,7 +111,7 @@ void StreamManager::destroy_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } @@ -127,7 +127,7 @@ void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -143,7 +143,7 @@ void StreamManager::synchronize_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } @@ -159,7 +159,7 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); From b4450f7c1837e3fb603ddf267fb0a01a8f781154 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:19:23 -0700 Subject: [PATCH 27/28] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- 
a/quadrants/python/export_stream.cpp +++ b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From e8d9cf0413588ddfd1c51967407d53d8c657136e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:18:08 -0700 Subject: [PATCH 28/28] Allow synchronizing the default AMDGPU stream (handle 0) The stream_handle != 0 guard made synchronize_stream a no-op for the default stream on AMDGPU, unlike the CUDA path. HIP supports hipStreamSynchronize(nullptr), so remove the guard to match CUDA semantics. Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 31fb12e76d..9686a86332 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (arch_ == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); }