From 7bd18ca4e1a9b6e99632af2c7c62076b4195ae3d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:42:22 -0700 Subject: [PATCH 01/28] Add AMDGPU/HIP stream support and async memory operations Mirrors the CUDA stream implementation for HIP: adds stream_ member to AMDGPUContext, stream_destroy/stream_wait_event/malloc_async/ mem_free_async to HIP driver functions, and AMDGPU branches in all Program stream/event methods. Converts AMDGPU kernel launcher to use async memory operations through the active stream. CPU backend returns 0 handles (no-op). --- quadrants/program/program.cpp | 64 ++++++++++++++ quadrants/rhi/amdgpu/amdgpu_context.cpp | 4 +- quadrants/rhi/amdgpu/amdgpu_context.h | 9 ++ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 8 ++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 51 +++++------ tests/python/test_streams.py | 84 ++++++++++++++++++- 6 files changed, 191 insertions(+), 29 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..f4bb8da35b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -25,6 +25,11 @@ #include "quadrants/rhi/cuda/cuda_context.h" #endif +#ifdef QD_WITH_AMDGPU +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" +#endif + #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -493,6 +498,13 @@ uint64 Program::stream_create() { CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); return reinterpret_cast(stream); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *stream = nullptr; + AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } #endif return 0; } @@ -504,6 +516,12 @@ void Program::stream_destroy(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if 
(compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUDriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif } void Program::stream_synchronize(uint64 stream_handle) { @@ -513,6 +531,12 @@ void Program::stream_synchronize(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUDriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif } void Program::set_current_cuda_stream(uint64 stream_handle) { @@ -522,6 +546,12 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif } uint64 Program::event_create() { @@ -532,6 +562,14 @@ uint64 Program::event_create() { 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *event = nullptr; + AMDGPUDriver::get_instance().event_create(&event, + 0x02 /*hipEventDisableTiming*/); + return reinterpret_cast(event); + } #endif return 0; } @@ -543,6 +581,12 @@ void Program::event_destroy(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif } void Program::event_record(uint64 event_handle, uint64 stream_handle) { @@ -553,6 +597,13 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_record( + reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif } void 
Program::event_synchronize(uint64 event_handle) { @@ -562,6 +613,12 @@ void Program::event_synchronize(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif } void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { @@ -572,6 +629,13 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { reinterpret_cast(event_handle), 0 /*flags*/); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif } } // namespace quadrants::lang diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 22f55339ee..f940ed9a7c 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -188,7 +188,7 @@ void AMDGPUContext::launch(void *func, void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02, (void *)&pack_size, (void *)0x03}; driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, - dynamic_shared_mem_bytes, nullptr, nullptr, + dynamic_shared_mem_bytes, stream_, nullptr, reinterpret_cast(&config)); } std::free(packed_arg); @@ -197,7 +197,7 @@ void AMDGPUContext::launch(void *func, profiler_->stop(task_handle); if (debug_) { - driver_.stream_synchronize(nullptr); + driver_.stream_synchronize(stream_); } } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 9529953bf1..68e7cd7314 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,6 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; + void *stream_{nullptr}; 
std::vector kernel_arg_pointer_; public: @@ -116,6 +117,14 @@ class AMDGPUContext { return std::unique_lock(lock_); } + void set_stream(void *stream) { + stream_ = stream; + } + + void *get_stream() const { + return stream_; + } + static AMDGPUContext &get_instance(); }; diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index dbb3612c87..6063d268a9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -26,6 +26,7 @@ PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management PER_AMDGPU_FUNCTION(memcpy_host_to_device, @@ -69,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, @@ -76,6 +78,7 @@ PER_AMDGPU_FUNCTION(malloc_managed, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); PER_AMDGPU_FUNCTION(mem_free, hipFree, void *); +PER_AMDGPU_FUNCTION(mem_free_async, hipFreeAsync, void *, void *); PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *); PER_AMDGPU_FUNCTION(mem_get_attribute, hipPointerGetAttribute, @@ -121,6 +124,11 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); +PER_AMDGPU_FUNCTION(stream_wait_event, + hipStreamWaitEvent, + void *, + void *, + uint32); // Event management PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 
6ef0b0e0e5..1d8430d35e 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/amdgpu/kernel_launcher.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" #include "quadrants/program/launch_context_builder.h" namespace quadrants::lang { @@ -32,18 +33,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, transfers; std::unordered_map device_ptrs; + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; - // Here we have to guarantee the result_result_buffer isn't nullptr - // It is interesting - The code following - // L60: DeviceAllocation devalloc = - // executor->allocate_memory_on_device( call another kernel and it will result - // in - // Memory access fault by GPU node-1 (Agent handle: 0xeda5ca0) on address - // (nil). Reason: Page not present or supervisor privilege. - // if you don't allocate it. - AMDGPUDriver::get_instance().malloc( + // Must always allocate device_result_buffer (even when result_buffer_size + // is 0) to avoid memory access faults from allocate_memory_on_device below. 
+ AMDGPUDriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64))); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; @@ -86,27 +83,28 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { - // Malloc_Async and Free_Async are available after ROCm 5.4 ctx.get_context().result_buffer = (uint64 *)device_result_buffer; } char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().malloc((void **)&device_arg_buffer, - ctx.arg_buffer_size); - AMDGPUDriver::get_instance().memcpy_host_to_device( - device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size); + AMDGPUDriver::get_instance().malloc_async( + (void **)&device_arg_buffer, ctx.arg_buffer_size, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } void *context_pointer; int arg_size = sizeof(RuntimeContext *); - AMDGPUDriver::get_instance().malloc((void **)&context_pointer, - sizeof(RuntimeContext)); - AMDGPUDriver::get_instance().memcpy_host_to_device( - context_pointer, &ctx.get_context(), sizeof(RuntimeContext)); + AMDGPUDriver::get_instance().malloc_async( + (void **)&context_pointer, sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -119,13 +117,16 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free(device_arg_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, + active_stream); } if (ctx.result_buffer_size > 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host( - host_result_buffer, device_result_buffer, ctx.result_buffer_size); + AMDGPUDriver::get_instance().memcpy_device_to_host_async( + host_result_buffer, device_result_buffer, ctx.result_buffer_size, + active_stream); } if (transfers.size()) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; auto arg_id = idx.arg_id; @@ -135,8 +136,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - // Since we always allocating above then we should always free - AMDGPUDriver::get_instance().mem_free(device_result_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -8,7 +8,7 @@ from tests import test_utils -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_stream(): s = qd.create_stream() assert isinstance(s, Stream) @@ -17,7 +17,7 @@ def test_create_and_destroy_stream(): assert s.handle == 0 -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_event(): e = qd.create_event() assert isinstance(e, Event) @@ -195,3 +195,83 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def 
test_concurrent_streams_with_events(): + """Two slow kernels on separate streams run concurrently (~1s on GPU), + serial fallback on CPU/Metal.""" + SPIN_ITERS = 5_000_000 + + @qd.kernel + def slow_fill( + a: qd.types.ndarray(dtype=qd.f32, ndim=1), + lcg_state: qd.types.ndarray(dtype=qd.i32, ndim=1), + index: qd.i32, + value: qd.f32, + ): + qd.loop_config(block_dim=1) + for _ in range(1): + x = lcg_state[index] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + lcg_state[index] = x + a[index] = value + + @qd.kernel + def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): + qd.loop_config(block_dim=1) + for _ in range(1): + a[2] = a[0] + a[1] + + import time + + # Warm up JIT + a_warmup = qd.ndarray(qd.f32, shape=(3,)) + lcg_warmup = qd.ndarray(qd.i32, shape=(3,)) + slow_fill(a_warmup, lcg_warmup, 0, 0.0) + add_first_two(a_warmup) + qd.sync() + + # Serial baseline + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0) + slow_fill(a, lcg, 1, 7.0) + add_first_two(a) + qd.sync() + serial_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + # Streams + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + s1 = qd.create_stream() + s2 = qd.create_stream() + e1 = qd.create_event() + e2 = qd.create_event() + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0, qd_stream=s1) + slow_fill(a, lcg, 1, 7.0, qd_stream=s2) + e1.record(s1) + e2.record(s2) + e1.wait() + e2.wait() + add_first_two(a) + qd.sync() + stream_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + speedup = serial_time / stream_time + if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + else: + assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" + + s1.destroy() + s2.destroy() + e1.destroy() + 
e2.destroy() From 7555ec5edf0581290df8b902b5a31e6162521fe3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:03 -0700 Subject: [PATCH 02/28] Move AMDGPU mem_free_async before transfers sync to match CUDA ordering Batch the device_result_buffer free into the stream pipeline before the sync barrier, matching the CUDA kernel launcher's ordering for consistency and marginal performance improvement. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1d8430d35e..cff0f2b4a1 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -125,6 +125,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { @@ -136,8 +138,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( From c12d23e1e1426a0b538382cb5dcab489e4c09b2e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:18 -0700 Subject: [PATCH 03/28] Convert AMDGPU sync memcpy_host_to_device to async on active_stream Use memcpy_host_to_device_async for external array transfers so they are properly ordered on the active stream, matching the CUDA launcher. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index cff0f2b4a1..f772fc7b5b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -66,8 +66,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); From 1673a38761b50fb6af4767e569fbf88751bb4788 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:25 -0700 Subject: [PATCH 04/28] Document ROCm >= 5.4 requirement for hipMallocAsync/hipFreeAsync --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6063d268a9..25e33774e7 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -70,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4 PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, From 60d015bfddac7068d1d1067d8f059e9c3236447e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:35 -0700 Subject: [PATCH 05/28] Relax concurrency test threshold and log timings Lower GPU speedup 
threshold from 1.5x to 1.3x to reduce flakiness in CI under contention, and print actual timings for diagnostics. --- tests/python/test_streams.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..236578974d 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,8 +266,9 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time + print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From c4be4ffd7c77a68ed6176ce30900d1a2260dec5b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:55 -0700 Subject: [PATCH 06/28] Add handle==0 guard to AMDGPU stream_synchronize and make stream_ thread_local Mirror the CUDA fixes: guard stream_synchronize against handle==0 to avoid unintentional default stream sync, and make AMDGPUContext::stream_ thread_local for thread-safety. 
--- quadrants/program/program.cpp | 2 +- quadrants/rhi/amdgpu/amdgpu_context.cpp | 2 ++ quadrants/rhi/amdgpu/amdgpu_context.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index faac67970c..8bab1d30f7 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -532,7 +532,7 @@ void Program::stream_synchronize(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { AMDGPUDriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index f940ed9a7c..24d924ed0d 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -13,6 +13,8 @@ namespace quadrants { namespace lang { +thread_local void *AMDGPUContext::stream_ = nullptr; + AMDGPUContext::AMDGPUContext() : driver_(AMDGPUDriver::get_instance_without_context()) { dev_count_ = 0; diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 68e7cd7314..4fc7c8328b 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,7 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; - void *stream_{nullptr}; + static thread_local void *stream_; std::vector kernel_arg_pointer_; public: From b28e7c60901fdde76ff2b9ea153534f15a0050ac Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:23:15 -0700 Subject: [PATCH 07/28] Revert "Relax concurrency test threshold and log timings" This reverts commit 60d015bfddac7068d1d1067d8f059e9c3236447e. 
--- tests/python/test_streams.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 236578974d..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,9 +266,8 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time - print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From 31fffbf1730e32c200eed37e8b4a4740ddc28b50 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 19:03:53 -0700 Subject: [PATCH 08/28] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 28 +++++++------------ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 6 +--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 18 ++++++------ 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 43e8df1236..648f3291c3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -491,8 +491,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -505,8 +504,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_synchronize( - 
reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } @@ -519,8 +517,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { - AMDGPUContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -536,8 +533,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *event = nullptr; - AMDGPUDriver::get_instance().event_create(&event, - 0x02 /*hipEventDisableTiming*/); + AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); } #endif @@ -552,8 +548,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); + AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -567,9 +562,8 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -582,8 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -597,9 +590,8 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 
event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6a01c3a87a..6be39db108 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -62,11 +62,7 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, hipOccupancyMaxActiveBlocksPerMultipro // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); -PER_AMDGPU_FUNCTION(stream_wait_event, - hipStreamWaitEvent, - void *, - void *, - uint32); +PER_AMDGPU_FUNCTION(stream_wait_event, hipStreamWaitEvent, void *, void *, uint32); // Event management PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1c5c573d85..cace0821ce 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -86,16 +86,16 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device(arr_sz, (uint64 *)device_result_buffer); 
device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[grad_ptr_idx], grad_ptr, + arr_sz, active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } @@ -141,8 +141,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *context_pointer; int arg_size = sizeof(RuntimeContext *); AMDGPUDriver::get_instance().malloc_async((void **)&context_pointer, sizeof(RuntimeContext), active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), - sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -154,15 +154,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From 798f87a18139fb8799d9b1d91135b2f6b8066a8d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: 
Fri, 24 Apr 2026 04:55:29 -0700 Subject: [PATCH 09/28] Exclude flaky test_perf_dispatch_python from Metal and Vulkan The pure-Python perf_dispatch timing test is unreliable on Mac Metal and Vulkan (MoltenVK) where timing differences between implementations are too small to consistently pick the fastest one. Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..2de074ed3c 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.metal, qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From 22389690c487e1bc05da15ed213b7e2f7bb0d7ed Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:43:36 -0700 Subject: [PATCH 10/28] [Doc] Update streams doc with AMDGPU support --- docs/source/user_guide/streams.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..cd26e01d20 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -9,6 +9,7 @@ and control synchronization with events. 
| Backend | Streams | Events | Notes | |---------|---------|--------|-------| | CUDA | Yes | Yes | Full concurrent execution | +| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) | | CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | From 8efd51f116d3825d152ee67bfbb2430a5ee25d6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:04:18 -0700 Subject: [PATCH 11/28] Address review comments: fix AMDGPU stream issues - Fix stream_synchronize(nullptr) in do-while loop to sync active stream, mirroring the CUDA path (claude red) - Remove unused kernel_arg_pointer_ member from AMDGPUContext (claude yellow) - Reword misleading ROCm fallback comment to clarify it's per-device, not per-runtime-version (claude yellow) - Fix stream_create ABI: bind to hipStreamCreateWithFlags instead of hipStreamCreate to match the two-arg call signature (codex P2) --- quadrants/rhi/amdgpu/amdgpu_context.h | 1 - quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index b9fd5c403c..083406c3f9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -25,7 +25,6 @@ class AMDGPUContext { bool debug_{false}; bool supports_mem_pool_{false}; static thread_local void *stream_; - std::vector kernel_arg_pointer_; public: AMDGPUContext(); diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index b6a4d7ba3e..d91afcac00 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -16,7 +16,7 @@ 
PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management -PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management @@ -29,7 +29,7 @@ PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, voi PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); // hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// transparently fall back to the synchronous variants when unsupported. +// fall back to the synchronous variants on devices without memory-pool support. PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 5bb5e70194..d54331f237 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -71,7 +71,8 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & do { launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks, context_pointer, arg_size); counter_val = 0; - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + auto *stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().stream_synchronize(stream); AMDGPUDriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From 34e9fa6aa47672ad4a59d2d2d4e952b1aec66698 Mon Sep 17 00:00:00 2001 From: Hugh 
Perkins Date: Fri, 1 May 2026 04:14:01 -0700 Subject: [PATCH 12/28] Use HIP_STREAM_NON_BLOCKING for AMDGPU stream_create to mirror CUDA path --- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 36c27942d0..f3fdeef548 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,7 +510,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *stream = nullptr; - AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); } #endif From 162239e38cbd9ce3fcd1365181c1f3470be194d8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:35:34 -0700 Subject: [PATCH 13/28] Use active stream for AMDGPU adstack metadata copies in publish_adstack_metadata AMDGPUContext::launch now dispatches on the user stream, so the adstack H2D copies must target the same stream to maintain ordering. Mirrors the CUDA branch. --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..bc319f9c38 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -851,11 +851,10 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. 
On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. the default stream), so the copies match that. + // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. Both CUDA and AMDGPU + // fetch the active stream from their respective context singletons (configurable via `set_stream`, + // defaults to the null stream), matching the stream used by `CUDAContext::launch` / + // `AMDGPUContext::launch`. #if defined(QD_WITH_CUDA) if (config_.arch == Arch::cuda) { void *active_stream = CUDAContext::get_instance().get_stream(); @@ -869,7 +868,7 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf #endif #if defined(QD_WITH_AMDGPU) if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. + void *active_stream = AMDGPUContext::get_instance().get_stream(); AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, active_stream); AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, From 9334efd4f102def5c5458e7ccd0a99f63e80d63e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:46:42 -0700 Subject: [PATCH 14/28] Add make_current() to all AMDGPU stream/event Program methods Mirrors commit 8b3d4ed from the CUDA path: HIP uses the same primary-context-per-thread model, so calling these methods from a non-init thread requires make_current() to bind the context first. 
--- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 89972bdf6f..2c9e57e378 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,6 +510,7 @@ uint64 Program::stream_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); @@ -527,6 +528,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -541,6 +543,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -555,6 +558,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -571,6 +575,7 @@ uint64 Program::event_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); @@ -588,6 +593,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + 
AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif @@ -603,6 +609,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); } @@ -618,6 +625,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif @@ -633,6 +641,7 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); } From 1fba4f56f6a0a2a276ffb7bd23c2d8a6374fde6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:57:54 -0700 Subject: [PATCH 15/28] Use async DtoH on active_stream for AMDGPU resolve_num_threads readback Mirrors aa4a70f from the CUDA path: with non-blocking user streams, synchronous DtoH on the NULL stream has no ordering with the prep task's store on active_stream, risking stale begin/end values. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index d54331f237..bb19087586 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -25,15 +25,17 @@ std::size_t resolve_num_threads(const OffloadedTask &task, LlvmRuntimeExecutor * std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = AMDGPUContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()`, so only `static_num_threads = grid_dim * block_dim` slices can be touched From f89bde02c5497856745bc93dd73fd2825ad2d489 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:19:16 -0700 Subject: [PATCH 16/28] Sync active_stream unconditionally at end of AMDGPU launch_llvm_kernel Mirrors 5901a7fc from the CUDA path: when transfers is empty, the result-buffer DtoH and mem_free_async were left in-flight on a 
non-blocking stream with no sync before return. Also converts transfer DtoH copies to async to match CUDA. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index bb19087586..0c5b4bad05 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -211,13 +211,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - auto arg_id = idx.arg_id; - AMDGPUDriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[arg_id]); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From ef3b95b18361dce692b02e4beff5a0a496fb5ff3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:30:20 -0700 Subject: [PATCH 17/28] Use async DtoH on active_stream for sizer stride readback The sizer kernel now runs on the user stream via Context::launch, so the synchronous copy_d2h (NULL stream) can read stale stride values with non-blocking streams. Use stream-aware async DtoH + sync for both CUDA and AMDGPU, falling back to copy_d2h for other backends. 
--- .../runtime/llvm/llvm_runtime_executor.cpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index bc319f9c38..1fff73575b 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -922,9 +922,9 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - // Invoke the device interpreter. On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. + // Invoke the device interpreter. `JITModule::call` launches this as a single-thread kernel on the active + // stream (CUDA/AMDGPU both dispatch through `{CUDA,AMDGPU}Context::launch` which uses `stream_`), so the + // writes are stream-ordered before the subsequent main-kernel dispatch. // // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of @@ -943,8 +943,27 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_context_ptr_for_sizer, bytecode_dev_ptr); // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // Use async DtoH on active_stream + sync so the readback is ordered after the sizer kernel. 
uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); +#if defined(QD_WITH_AMDGPU) + if (config_.arch == Arch::amdgpu) { + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } else +#endif +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + void *active_stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + CUDADriver::get_instance().stream_synchronize(active_stream); + } else +#endif + { + copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); + } stride = static_cast(stride_u64); } From 7f0f29958c234668651c864fa999e696f4d3a895 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:51:43 -0700 Subject: [PATCH 18/28] Fix end-of-launcher sync: conditional + dealloc race on AMDGPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors 8550aa0 from the CUDA path: 1. Make stream_synchronize conditional — only sync when result_buffer or transfers need it, avoiding host-blocking on every launch. 2. Add sync between async DtoH and device memory deallocation to prevent race with non-blocking streams. Also fixes black formatting from base branch merge. 
--- python/quadrants/lang/kernel.py | 6 ++++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..766689b02d 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -650,8 +650,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." + ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 0c5b4bad05..b32e0981ea 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -214,12 +214,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); - AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle 
KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From 84806cfdfdd3b5aa366745872429892fc37c2157 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:14:18 -0700 Subject: [PATCH 19/28] Fix NULL-stream DtoH races in synchronize() and allocate_llvm_runtime_memory_jit synchronize() now drains the active user stream (if any) before the NULL stream, so fetch_result_uint64 callers (lazy field-pointer caches at three sites) read correct values when the runtime-query kernel ran on a non-blocking user stream. allocate_llvm_runtime_memory_jit: use async H2D on active_stream for the zero-stamp and sync the active stream before the DtoH readback, so the allocator kernel result is visible. --- quadrants/rhi/amdgpu/amdgpu_device.cpp | 6 ++++-- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index 68c377a73a..d127ce19a0 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -1,4 +1,5 @@ #include "quadrants/rhi/amdgpu/amdgpu_device.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" #include "quadrants/rhi/llvm/device_memory_pool.h" #include "quadrants/jit/jit_module.h" @@ -93,11 +94,12 @@ uint64_t *AmdgpuDevice::allocate_llvm_runtime_memory_jit(const LlvmRuntimeAllocP // the kernel without writing to *result. To detect that here, zero the slot first so a null readback unambiguously // means "allocation failed" and we can surface a helpful host-side message instead of letting the downstream // hipMemset trip on the stale pointer with a cryptic hipErrorInvalidValue. 
+ void *active_stream = AMDGPUContext::get_instance().get_stream(); uint64 zero = 0; - AMDGPUDriver::get_instance().memcpy_host_to_device(params.result_buffer, &zero, sizeof(uint64)); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(params.result_buffer, &zero, sizeof(uint64), active_stream); params.runtime_jit->call("runtime_memory_allocate_aligned", params.runtime, params.size, quadrants_page_size, params.result_buffer); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); uint64 *ret{nullptr}; AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, params.result_buffer, sizeof(uint64)); QD_ERROR_IF(ret == nullptr, diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 1fff73575b..390987768a 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,12 +188,20 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) + auto *active_stream = CUDAContext::get_instance().get_stream(); + if (active_stream != nullptr) { + CUDADriver::get_instance().stream_synchronize(active_stream); + } CUDADriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + if (active_stream != nullptr) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } AMDGPUDriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No AMDGPU support"); From ae1c932db2df45bdd0069e5c2a3b748a8b3d2128 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:17:21 -0700 Subject: [PATCH 20/28] Reflow comments and docstring to 120-char line width Co-authored-by: 
Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- tests/python/test_streams.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index d91afcac00..0b789cedf5 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -28,8 +28,8 @@ PER_AMDGPU_FUNCTION(memcpy_async, hipMemcpyAsync, void *, void *, std::size_t, u PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); -// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// fall back to the synchronous variants on devices without memory-pool support. +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers fall back to the synchronous variants +// on devices without memory-pool support. 
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index b32e0981ea..67befa8b66 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -105,8 +105,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx auto *active_stream = AMDGPUContext::get_instance().get_stream(); char *device_result_buffer{nullptr}; - // Must always allocate device_result_buffer (even when result_buffer_size - // is 0) to avoid memory access faults from allocate_memory_on_device below. + // Must always allocate device_result_buffer (even when result_buffer_size is 0) to avoid memory access faults + // from allocate_memory_on_device below. AMDGPUDriver::get_instance().malloc_async((void **)&device_result_buffer, std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..969d18ecf1 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -199,8 +199,7 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): @test_utils.test() def test_concurrent_streams_with_events(): - """Two slow kernels on separate streams run concurrently (~1s on GPU), - serial fallback on CPU/Metal.""" + """Two slow kernels on separate streams run concurrently (~1s on GPU), serial fallback on CPU/Metal.""" SPIN_ITERS = 5_000_000 @qd.kernel From 3ef0340bdbba610abfd400042a9617b7e0542f03 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:40:06 -0700 Subject: [PATCH 21/28] Use context/device synchronize in synchronize() to drain all streams stream_synchronize(nullptr) does not drain non-blocking user streams 
(CU_STREAM_NON_BLOCKING / HIP_STREAM_NON_BLOCKING), so qd.sync() failed to honor its "drain everything" contract. Python's finally block resets stream_ to nullptr before qd.sync() runs, making the previous active-stream check dead code for the user-facing path. Replace with cuCtxSynchronize (CUDA) / hipDeviceSynchronize (AMDGPU) which drain all streams on the device, correctly implementing the documented qd.sync() semantics. Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 3 +++ quadrants/rhi/cuda/cuda_driver_functions.inc.h | 3 +++ quadrants/runtime/llvm/llvm_runtime_executor.cpp | 14 ++++---------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 0b789cedf5..c94a7f14db 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -15,6 +15,9 @@ PER_AMDGPU_FUNCTION(context_create, hipCtxCreate, void *, int, void *); PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); +// Device synchronization +PER_AMDGPU_FUNCTION(device_synchronize, hipDeviceSynchronize); + // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 55c5e3e0b8..b4164b7c33 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -53,6 +53,9 @@ PER_CUDA_FUNCTION(kernel_get_occupancy, cuOccupancyMaxActiveBlocksPerMultiproces PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_attribute_enum, int); +// Context management +PER_CUDA_FUNCTION(context_synchronize, cuCtxSynchronize); + // Stream management 
PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 390987768a..6d631cfc2f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,21 +188,15 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) - auto *active_stream = CUDAContext::get_instance().get_stream(); - if (active_stream != nullptr) { - CUDADriver::get_instance().stream_synchronize(active_stream); - } - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().context_synchronize(); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) - auto *active_stream = AMDGPUContext::get_instance().get_stream(); - if (active_stream != nullptr) { - AMDGPUDriver::get_instance().stream_synchronize(active_stream); - } - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUContext::get_instance().make_current(); + AMDGPUDriver::get_instance().device_synchronize(); #else QD_ERROR("No AMDGPU support"); #endif From 3a81a46abcd5a53eea40df89e7283b4516479667 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:32:15 -0700 Subject: [PATCH 22/28] Use synchronous mem_free in dealloc_memory pool branch mem_free_async on the NULL stream does not sync with non-blocking user streams, so a Python ndarray dropped while a kernel is still in flight could return its slab to the mempool prematurely. Using synchronous mem_free matches pre-stream-rewire behavior and implicitly waits for all pending work on the device. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index d127ce19a0..280cd9f7e1 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -125,7 +125,7 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { } QD_ASSERT(!info.is_imported); if (info.use_memory_pool) { - AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr); + AMDGPUDriver::get_instance().mem_free(info.ptr); } else if (info.use_cached) { DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/) .release(info.size, (uint64_t *)info.ptr, false); From 3499bbcccef6f174cbc15649b0dcbd00eaf5c990 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:04:44 -0700 Subject: [PATCH 23/28] Thread active_stream through AMDGPU profiler event_record and sync Profiler events were hardcoded to the NULL stream while kernels now run on user streams; with HIP_STREAM_NON_BLOCKING both events signal immediately on the empty NULL stream, yielding ~0 ms timings. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_profiler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp index 731d536bca..e963f7df20 100644 --- a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp @@ -59,8 +59,9 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - AMDGPUDriver::get_instance().event_record(handle, 0); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record(handle, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); // get elapsed time and destroy events auto record = event_toolkit_->get_current_event_record(); @@ -154,7 +155,8 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: AMDGPUDriver::get_instance().event_create(&(record.start_event), HIP_EVENT_DEFAULT); AMDGPUDriver::get_instance().event_create(&(record.stop_event), HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record((record.start_event), 0); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record((record.start_event), active_stream); event_records_.push_back(record); if (!base_event_) { @@ -163,7 +165,7 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: for (int i = 0; i < n_iters; i++) { void *e; AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record(e, 0); + AMDGPUDriver::get_instance().event_record(e, active_stream); AMDGPUDriver::get_instance().event_synchronize(e); auto final_t = Time::get_time(); if (i == n_iters - 1) { From 6e49c52d13f426dcac3c14b5b839059db2cb5839 Mon Sep 17 00:00:00 2001 From: 
Hugh Perkins Date: Sat, 2 May 2026 03:12:15 -0700 Subject: [PATCH 24/28] Restore context_pointer free comment in AMDGPU kernel launcher The comment explains a non-obvious race condition: context_pointer must be freed directly (now via mem_free_async on active_stream) rather than through AMDGPUContext's deferred free list, because that list is drained by LlvmRuntimeExecutor::synchronize which can be called mid-launch. Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index ab34003cbd..42db3934dd 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -288,6 +288,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } else if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); } + // Free the per-launch `RuntimeContext` on the active stream rather than through `AMDGPUContext`'s deferred free + // list. The deferred list is drained by `LlvmRuntimeExecutor::synchronize`, which is also called from + // `fetch_result_uint64` during `ensure_adstack_heap`'s field-pointer query -- that path would free + // `context_pointer` mid-launch, and HIP could recycle the address for the adstack heap allocated right after, + // clobbering the `RuntimeContext` the next task still reads from. 
AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); } From 1c81322cbe0e418a6deaa765d877a505d29ced16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:29:18 -0700 Subject: [PATCH 25/28] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From d3317f5cf00e4955095edefdeab68227426243c5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 06:01:11 -0700 Subject: [PATCH 26/28] Fix AMDGPU branches in StreamManager: use arch_ member instead of compile_config() The base branch refactored stream/event methods from Program:: to StreamManager::, which stores the arch in arch_. Our AMDGPU branches still referenced compile_config().arch, but compile_config() is a Program method and is not available from StreamManager.
Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 88288cc313..b4adc0226a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -28,7 +28,7 @@ uint64 StreamManager::create_stream() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); @@ -46,7 +46,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } @@ -76,7 +76,7 @@ void StreamManager::set_current_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } @@ -93,7 +93,7 @@ uint64 StreamManager::create_event() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 
/*hipEventDisableTiming*/); @@ -111,7 +111,7 @@ void StreamManager::destroy_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } @@ -127,7 +127,7 @@ void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -143,7 +143,7 @@ void StreamManager::synchronize_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } @@ -159,7 +159,7 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); From b4450f7c1837e3fb603ddf267fb0a01a8f781154 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:19:23 -0700 Subject: [PATCH 27/28] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- 
a/quadrants/python/export_stream.cpp +++ b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From e8d9cf0413588ddfd1c51967407d53d8c657136e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:18:08 -0700 Subject: [PATCH 28/28] Allow synchronizing the default AMDGPU stream (handle 0) The stream_handle != 0 guard made synchronize_stream a no-op for the default stream on AMDGPU, unlike the CUDA path. HIP supports hipStreamSynchronize(nullptr), so remove the guard to match CUDA semantics. Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 31fb12e76d..9686a86332 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (arch_ == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); }