Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
7bd18ca
Add AMDGPU/HIP stream support and async memory operations
hughperkins Mar 11, 2026
b133bd7
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins Mar 12, 2026
7555ec5
Move AMDGPU mem_free_async before transfers sync to match CUDA ordering
hughperkins Mar 12, 2026
c12d23e
Convert AMDGPU sync memcpy_host_to_device to async on active_stream
hughperkins Mar 12, 2026
1673a38
Document ROCm >= 5.4 requirement for hipMallocAsync/hipFreeAsync
hughperkins Mar 12, 2026
60d015b
Relax concurrency test threshold and log timings
hughperkins Mar 12, 2026
c4be4ff
Add handle==0 guard to AMDGPU stream_synchronize and make stream_ thr…
hughperkins Mar 12, 2026
b28e7c6
Revert "Relax concurrency test threshold and log timings"
hughperkins Mar 12, 2026
3970abc
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins Apr 20, 2026
31fffbf
Apply clang-format
hughperkins Apr 20, 2026
1056bb4
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins Apr 24, 2026
798f87a
Exclude flaky test_perf_dispatch_python from Metal and Vulkan
hughperkins Apr 24, 2026
22c5524
Merge origin/hp/streams-quadrantsic-1-cuda-streams, resolve conflict …
hughperkins Apr 24, 2026
f42d4eb
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins Apr 28, 2026
2238969
[Doc] Update streams doc with AMDGPU support
hughperkins Apr 28, 2026
228150a
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins Apr 28, 2026
e368b4d
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins Apr 28, 2026
958c247
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins Apr 28, 2026
aff950d
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins May 1, 2026
84715de
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
8efd51f
Address review comments: fix AMDGPU stream issues
hughperkins May 1, 2026
34e9fa6
Use HIP_STREAM_NON_BLOCKING for AMDGPU stream_create to mirror CUDA path
hughperkins May 1, 2026
675542a
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
162239e
Use active stream for AMDGPU adstack metadata copies in publish_adsta…
hughperkins May 1, 2026
9334efd
Add make_current() to all AMDGPU stream/event Program methods
hughperkins May 1, 2026
c7eed44
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
1fba4f5
Use async DtoH on active_stream for AMDGPU resolve_num_threads readback
hughperkins May 1, 2026
0af8e19
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
f89bde0
Sync active_stream unconditionally at end of AMDGPU launch_llvm_kernel
hughperkins May 1, 2026
ef3b95b
Use async DtoH on active_stream for sizer stride readback
hughperkins May 1, 2026
64a389d
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
7f0f299
Fix end-of-launcher sync: conditional + dealloc race on AMDGPU
hughperkins May 1, 2026
5e8d198
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
84806cf
Fix NULL-stream DtoH races in synchronize() and allocate_llvm_runtime…
hughperkins May 1, 2026
05dcb4d
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
ae1c932
Reflow comments and docstring to 120-char line width
hughperkins May 1, 2026
3ef0340
Use context/device synchronize in synchronize() to drain all streams
hughperkins May 1, 2026
3a81a46
Use synchronous mem_free in dealloc_memory pool branch
hughperkins May 1, 2026
02ac865
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
3499bbc
Thread active_stream through AMDGPU profiler event_record and sync
hughperkins May 1, 2026
ce2fc6b
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
117a71f
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 1, 2026
8f71c91
Merge base branch: drop autodiff stream changes per new policy
hughperkins May 2, 2026
b030e4c
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
6e49c52
Restore context_pointer free comment in AMDGPU kernel launcher
hughperkins May 2, 2026
176e7d3
Merge base branch: add AMDGPU support to extracted program_stream.cpp
hughperkins May 2, 2026
c1562f2
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
1c81322
Fix clang-format in program_stream.h
hughperkins May 2, 2026
91fae3f
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
d3317f5
Fix AMDGPU branches in StreamManager: use arch_ member instead of com…
hughperkins May 2, 2026
33f2a04
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins May 2, 2026
b7eb63a
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
52a3be1
Merge branch 'hp/streams-quadrantsic-2-amdgpu-cpu' of github.com:Gene…
hughperkins May 2, 2026
4cef21b
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
4711160
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
b4450f7
Fix clang-format in export_stream.cpp
hughperkins May 2, 2026
93cd166
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
e8d9cf0
Allow synchronizing the default AMDGPU stream (handle 0)
hughperkins May 2, 2026
3f5a868
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
392b19a
Merge remote-tracking branch 'origin/hp/streams-quadrantsic-1-cuda-st…
hughperkins May 2, 2026
f67e7fd
Merge branch 'hp/streams-quadrantsic-1-cuda-streams' into hp/streams-…
hughperkins May 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/user_guide/streams.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Streams allow concurrent execution of GPU operations. By default, all Quadrants
| Backend | Streams | Events | Notes |
|---------|---------|--------|-------|
| CUDA | Yes | Yes | Full concurrent execution |
| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) |
| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially |
| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially |
| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially |
Expand Down
65 changes: 65 additions & 0 deletions quadrants/program/program_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
#include "quadrants/rhi/cuda/cuda_context.h"
#endif

#ifdef QD_WITH_AMDGPU
#include "quadrants/rhi/amdgpu/amdgpu_driver.h"
#include "quadrants/rhi/amdgpu/amdgpu_context.h"
#endif

namespace quadrants::lang {

// ---------------------------------------------------------------------------
Expand All @@ -21,6 +26,14 @@ uint64 StreamManager::create_stream() {
CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/);
return reinterpret_cast<uint64>(stream);
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu) {
AMDGPUContext::get_instance().make_current();
void *stream = nullptr;
AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/);
return reinterpret_cast<uint64>(stream);
}
#endif
return 0;
}
Expand All @@ -32,6 +45,12 @@ void StreamManager::destroy_stream(uint64 stream_handle) {
CUDADriver::get_instance().stream_destroy(reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu && stream_handle != 0) {
AMDGPUContext::get_instance().make_current();
AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast<void *>(stream_handle));
}
#endif
}

void StreamManager::synchronize_stream(uint64 stream_handle) {
Expand All @@ -41,6 +60,12 @@ void StreamManager::synchronize_stream(uint64 stream_handle) {
CUDADriver::get_instance().stream_synchronize(reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu) {
AMDGPUContext::get_instance().make_current();
AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast<void *>(stream_handle));
}
#endif
}

void StreamManager::set_current_stream(uint64 stream_handle) {
Expand All @@ -50,6 +75,12 @@ void StreamManager::set_current_stream(uint64 stream_handle) {
CUDAContext::get_instance().set_stream(reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu) {
AMDGPUContext::get_instance().make_current();
AMDGPUContext::get_instance().set_stream(reinterpret_cast<void *>(stream_handle));
}
#endif
}

uint64 StreamManager::create_event() {
Expand All @@ -60,6 +91,14 @@ uint64 StreamManager::create_event() {
CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/);
return reinterpret_cast<uint64>(event);
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu) {
AMDGPUContext::get_instance().make_current();
void *event = nullptr;
AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/);
return reinterpret_cast<uint64>(event);
}
#endif
return 0;
}
Expand All @@ -71,6 +110,12 @@ void StreamManager::destroy_event(uint64 event_handle) {
CUDADriver::get_instance().event_destroy(reinterpret_cast<void *>(event_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu && event_handle != 0) {
AMDGPUContext::get_instance().make_current();
AMDGPUDriver::get_instance().event_destroy(reinterpret_cast<void *>(event_handle));
}
#endif
}

void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) {
Expand All @@ -81,6 +126,13 @@ void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) {
reinterpret_cast<void *>(stream_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu && event_handle != 0) {
AMDGPUContext::get_instance().make_current();
AMDGPUDriver::get_instance().event_record(reinterpret_cast<void *>(event_handle),
reinterpret_cast<void *>(stream_handle));
}
#endif
}

void StreamManager::synchronize_event(uint64 event_handle) {
Expand All @@ -90,6 +142,12 @@ void StreamManager::synchronize_event(uint64 event_handle) {
CUDADriver::get_instance().event_synchronize(reinterpret_cast<void *>(event_handle));
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu && event_handle != 0) {
AMDGPUContext::get_instance().make_current();
AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast<void *>(event_handle));
}
#endif
}

void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) {
Expand All @@ -100,6 +158,13 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle)
reinterpret_cast<void *>(event_handle), 0 /*flags*/);
}
#endif
#ifdef QD_WITH_AMDGPU
if (arch_ == Arch::amdgpu && event_handle != 0) {
AMDGPUContext::get_instance().make_current();
AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast<void *>(stream_handle),
reinterpret_cast<void *>(event_handle), 0 /*flags*/);
}
#endif
}

} // namespace quadrants::lang
6 changes: 4 additions & 2 deletions quadrants/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
namespace quadrants {
namespace lang {

thread_local void *AMDGPUContext::stream_ = nullptr;

AMDGPUContext::AMDGPUContext() : driver_(AMDGPUDriver::get_instance_without_context()) {
dev_count_ = 0;
driver_.init(0);
Expand Down Expand Up @@ -190,7 +192,7 @@ void AMDGPUContext::launch(void *func,
if (grid_dim > 0) {
std::lock_guard<std::mutex> _(lock_);
void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02, (void *)&pack_size, (void *)0x03};
driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, dynamic_shared_mem_bytes, nullptr, nullptr,
driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, dynamic_shared_mem_bytes, stream_, nullptr,
Comment thread
claude[bot] marked this conversation as resolved.
reinterpret_cast<void **>(&config));
Comment thread
claude[bot] marked this conversation as resolved.
Comment thread
claude[bot] marked this conversation as resolved.
Comment thread
claude[bot] marked this conversation as resolved.
Comment thread
claude[bot] marked this conversation as resolved.
}
std::free(packed_arg);
Expand All @@ -199,7 +201,7 @@ void AMDGPUContext::launch(void *func,
profiler_->stop(task_handle);

if (debug_) {
driver_.stream_synchronize(nullptr);
driver_.stream_synchronize(stream_);
}
}

Expand Down
9 changes: 9 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class AMDGPUContext {
AMDGPUDriver &driver_;
bool debug_{false};
bool supports_mem_pool_{false};
static thread_local void *stream_;

public:
AMDGPUContext();
Expand Down Expand Up @@ -113,6 +114,14 @@ class AMDGPUContext {
return std::unique_lock<std::mutex>(lock_);
}

void set_stream(void *stream) {
stream_ = stream;
}

void *get_stream() const {
return stream_;
}

static AMDGPUContext &get_instance();
};

Expand Down
8 changes: 5 additions & 3 deletions quadrants/rhi/amdgpu/amdgpu_device.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "quadrants/rhi/amdgpu/amdgpu_device.h"
#include "quadrants/rhi/amdgpu/amdgpu_context.h"
#include "quadrants/rhi/llvm/device_memory_pool.h"

#include "quadrants/jit/jit_module.h"
Expand Down Expand Up @@ -93,11 +94,12 @@ uint64_t *AmdgpuDevice::allocate_llvm_runtime_memory_jit(const LlvmRuntimeAllocP
// the kernel without writing to *result. To detect that here, zero the slot first so a null readback unambiguously
// means "allocation failed" and we can surface a helpful host-side message instead of letting the downstream
// hipMemset trip on the stale pointer with a cryptic hipErrorInvalidValue.
void *active_stream = AMDGPUContext::get_instance().get_stream();
uint64 zero = 0;
AMDGPUDriver::get_instance().memcpy_host_to_device(params.result_buffer, &zero, sizeof(uint64));
AMDGPUDriver::get_instance().memcpy_host_to_device_async(params.result_buffer, &zero, sizeof(uint64), active_stream);
params.runtime_jit->call<void *, std::size_t, std::size_t>("runtime_memory_allocate_aligned", params.runtime,
params.size, quadrants_page_size, params.result_buffer);
AMDGPUDriver::get_instance().stream_synchronize(nullptr);
AMDGPUDriver::get_instance().stream_synchronize(active_stream);
uint64 *ret{nullptr};
AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, params.result_buffer, sizeof(uint64));
QD_ERROR_IF(ret == nullptr,
Expand All @@ -123,7 +125,7 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) {
}
QD_ASSERT(!info.is_imported);
if (info.use_memory_pool) {
AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr);
AMDGPUDriver::get_instance().mem_free(info.ptr);
} else if (info.use_cached) {
DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/)
.release(info.size, (uint64_t *)info.ptr, false);
Expand Down
9 changes: 8 additions & 1 deletion quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@ PER_AMDGPU_FUNCTION(context_create, hipCtxCreate, void *, int, void *);
PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *);
PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **);

// Device synchronization
PER_AMDGPU_FUNCTION(device_synchronize, hipDeviceSynchronize);

// Stream management
PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32);
PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32);
PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *);

// Memory management
PER_AMDGPU_FUNCTION(memcpy_host_to_device, hipMemcpyHtoD, void *, void *, std::size_t);
Expand All @@ -27,6 +31,8 @@ PER_AMDGPU_FUNCTION(memcpy_async, hipMemcpyAsync, void *, void *, std::size_t, u
PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, void *, std::size_t, void *);
PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *);
PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t);
// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers fall back to the synchronous variants
// on devices without memory-pool support.
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *);
PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32);
PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t);
Expand Down Expand Up @@ -61,6 +67,7 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, hipOccupancyMaxActiveBlocksPerMultipro

// Stream management
PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *);
PER_AMDGPU_FUNCTION(stream_wait_event, hipStreamWaitEvent, void *, void *, uint32);

// Event management
PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32);
Expand Down
10 changes: 6 additions & 4 deletions quadrants/rhi/amdgpu/amdgpu_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
}

void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
AMDGPUDriver::get_instance().event_record(handle, 0);
AMDGPUDriver::get_instance().stream_synchronize(nullptr);
void *active_stream = AMDGPUContext::get_instance().get_stream();
AMDGPUDriver::get_instance().event_record(handle, active_stream);
AMDGPUDriver::get_instance().stream_synchronize(active_stream);

// get elapsed time and destroy events
auto record = event_toolkit_->get_current_event_record();
Expand Down Expand Up @@ -154,7 +155,8 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std::

AMDGPUDriver::get_instance().event_create(&(record.start_event), HIP_EVENT_DEFAULT);
AMDGPUDriver::get_instance().event_create(&(record.stop_event), HIP_EVENT_DEFAULT);
AMDGPUDriver::get_instance().event_record((record.start_event), 0);
void *active_stream = AMDGPUContext::get_instance().get_stream();
AMDGPUDriver::get_instance().event_record((record.start_event), active_stream);
event_records_.push_back(record);

if (!base_event_) {
Expand All @@ -163,7 +165,7 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std::
for (int i = 0; i < n_iters; i++) {
void *e;
AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT);
AMDGPUDriver::get_instance().event_record(e, 0);
AMDGPUDriver::get_instance().event_record(e, active_stream);
AMDGPUDriver::get_instance().event_synchronize(e);
auto final_t = Time::get_time();
if (i == n_iters - 1) {
Expand Down
3 changes: 3 additions & 0 deletions quadrants/rhi/cuda/cuda_driver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ PER_CUDA_FUNCTION(kernel_get_occupancy, cuOccupancyMaxActiveBlocksPerMultiproces
PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_attribute_enum, int);


// Context management
PER_CUDA_FUNCTION(context_synchronize, cuCtxSynchronize);

// Stream management
PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *);
PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32);
Expand Down
Loading
Loading