diff --git a/quadrants/runtime/amdgpu/device_scratch_buffer.h b/quadrants/runtime/amdgpu/device_scratch_buffer.h new file mode 100644 index 0000000000..ce8fbb5147 --- /dev/null +++ b/quadrants/runtime/amdgpu/device_scratch_buffer.h @@ -0,0 +1,181 @@ +#pragma once + +#include <cstddef> +#include <cstdint> + +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" + +namespace quadrants::lang { +namespace amdgpu { + +// RAII owner of a device-side scratch buffer allocated via hipMallocAsync. +// +// Motivation +// ---------- +// hipMallocAsync / hipFreeAsync on ROCm are NOT as cheap as the +// stream-ordered pool API is meant to be. Even on a fully-cached pool hit +// (i.e. when no actual device-side allocation work is required), each call +// still pays: +// +// * a mutex acquire on the pool data structure, +// * stream-ordering metadata bookkeeping in the CLR (rocclr) layer, +// * a hop into the runtime's worker thread for HSA-level scheduling. +// +// We cannot eliminate that cost from inside the driver. We have already +// raised HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD (see amdgpu_context.cpp) so +// that the pool keeps freed memory cached and never round-trips to the OS, +// but that knob only addresses the "actual allocation" branch, not the +// per-call overhead above. Empirically, eliminating the per-launch +// hipMallocAsync / hipFreeAsync calls in the kernel launcher hot path +// yields a uniform throughput improvement; nothing at the driver/pool +// layer can substitute for it. +// +// On NVIDIA the equivalent cuMemAllocAsync is fast enough that the CUDA +// backend doesn't bother caching. This is a ROCm-specific workaround at +// the application layer for a ROCm-specific cost. +// +// This class encapsulates the workaround: each kernel handle owns one of +// these per scratch buffer (arg buffer, result buffer), allocates it +// lazily on first use, reuses it across launches, grows it on demand, and +// frees it deterministically when the owning launcher is destroyed. 
+// +// Semantics +// --------- +// - Allocates lazily on the first ensure() that requires a non-zero size. +// - Grows monotonically: ensure(N) only re-allocates if N > current capacity; +// the buffer is never shrunk. +// - Frees its allocation in the destructor (and on grow, when discarding the +// smaller backing buffer). +// - Move-only. Copying would alias the same device pointer with two owners +// and double-free in the destructor. +// +// Stream affinity +// --------------- +// Each DeviceScratchBuffer is bound to one HIP stream for the lifetime +// of its allocation: the constructor accepts an optional `stream` +// argument, which is used for every malloc_async / mem_free_async issued +// by ensure() and the destructor. Defaults to nullptr, i.e. the default +// (NULL) stream, which is what the launcher uses today. +// +// Single-stream affinity is intentional. Stream-ordered allocators are +// safe to free on a different stream than they were allocated on, but +// only if the caller threads the appropriate event dependencies; tying +// the buffer to one stream makes the grow-cycle (mem_free_async old, +// malloc_async new) trivially ordered and lets the destructor free +// without any extra synchronisation. Callers that want a buffer on a +// different stream should construct a separate DeviceScratchBuffer with +// that stream rather than mutating an existing one. +// +// Reuse across back-to-back launches on the same stream is safe because +// the stream serialises H2D, kernel, D2H, and any subsequent grow-cycle +// free in the order they are enqueued. Callers performing async H2D +// into the buffer followed by a kernel that reads it (the typical +// kernel-launcher path) get correct ordering for free. +// +// Thread-safety +// ------------- +// Not thread-safe. Concurrent ensure() calls on the same instance race on +// ptr_ / capacity_. Callers must serialise access. 
(Genesis launches kernels +// from a single host thread, so this is not a practical limitation.) +// +// Teardown safety +// --------------- +// In the normal flow the launcher (and therefore every DeviceScratchBuffer +// it owns) is destroyed during gs.destroy() while the AMDGPU context is +// still alive, so mem_free_async returns the memory to the pool cleanly. +// In the at-exit path, where the AMDGPU context may have already been torn +// down, the underlying driver call is a best-effort no-op; the OS reclaims +// device memory at process exit regardless. +class DeviceScratchBuffer { + public: + // Constructs an empty buffer (no allocation) bound to `stream`. The + // stream is opaque from this class's point of view (typed void* to + // match the AMDGPUDriver API surface, which mirrors hipStream_t). + // Pass nullptr (the default) for the default HIP stream, which is what + // the kernel launcher uses today; callers that introduce a non-default + // stream in the future can plumb it through here without touching this + // class. + explicit DeviceScratchBuffer(void *stream = nullptr) noexcept + : stream_(stream) {} + ~DeviceScratchBuffer() noexcept { release(); } + + DeviceScratchBuffer(const DeviceScratchBuffer &) = delete; + DeviceScratchBuffer &operator=(const DeviceScratchBuffer &) = delete; + + // Move preserves stream affinity: the destination operates on the same + // stream the source was bound to. The source is left in a valid empty + // state on its (unchanged) stream. 
 + DeviceScratchBuffer(DeviceScratchBuffer &&other) noexcept + : stream_(other.stream_), + ptr_(other.ptr_), + capacity_(other.capacity_) { + other.ptr_ = nullptr; + other.capacity_ = 0; + } + + DeviceScratchBuffer &operator=(DeviceScratchBuffer &&other) noexcept { + if (this != &other) { + release(); + stream_ = other.stream_; + ptr_ = other.ptr_; + capacity_ = other.capacity_; + other.ptr_ = nullptr; + other.capacity_ = 0; + } + return *this; + } + + // Ensures the backing allocation is at least `min_bytes`, growing it + // on this buffer's bound stream if necessary. Returns the device + // pointer. + // + // The returned pointer is invalidated by any subsequent ensure() call + // that grows the buffer; callers must not retain it across launches. + char *ensure(std::size_t min_bytes) { + if (capacity_ < min_bytes) { + if (ptr_ != nullptr) { + AMDGPUDriver::get_instance().mem_free_async(ptr_, stream_); + ptr_ = nullptr; + } + AMDGPUDriver::get_instance().malloc_async( + reinterpret_cast<void **>(&ptr_), min_bytes, stream_); + capacity_ = min_bytes; + } + return ptr_; + } + + char *get() const noexcept { return ptr_; } + std::size_t capacity() const noexcept { return capacity_; } + void *stream() const noexcept { return stream_; } + + // Rebinds the buffer to a new stream. Only legal on an empty buffer + // (capacity == 0); rebinding while holding an allocation would risk + // freeing on a stream that doesn't have a valid happens-before edge + // to the buffer's last use. Callers that want to migrate a populated + // buffer to a different stream should release() first, set_stream(), + // then ensure() again. + void set_stream(void *stream) noexcept { + QD_ASSERT(capacity_ == 0); + stream_ = stream; + } + + // Eagerly releases the underlying allocation on this buffer's bound + // stream. Idempotent. Invoked automatically by the destructor; + // explicit invocation is rarely needed. 
+ void release() noexcept { + if (ptr_ != nullptr) { + AMDGPUDriver::get_instance().mem_free_async(ptr_, stream_); + ptr_ = nullptr; + capacity_ = 0; + } + } + + private: + void *stream_{nullptr}; + char *ptr_{nullptr}; + std::size_t capacity_{0}; +}; + +} // namespace amdgpu +} // namespace quadrants::lang + diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 02b91f1310..e00985bc59 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -102,10 +102,14 @@ bool KernelLauncher::on_amdgpu_device(void *ptr) { return ret_code == HIP_SUCCESS && attr_val[0] == HIP_MEMORYTYPE_DEVICE; } +// Hot path. Uses cached per-handle device scratch buffers, async H2D/D2H, +// and lazy result-buffer materialisation to eliminate per-launch +// hipMallocAsync / hipFreeAsync overhead. See device_scratch_buffer.h +// for the full motivation. void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { QD_ASSERT(handle.get_launch_id() < contexts_.size()); - auto launcher_ctx = contexts_[handle.get_launch_id()]; + auto &launcher_ctx = contexts_[handle.get_launch_id()]; auto *executor = get_runtime_executor(); auto *amdgpu_module = launcher_ctx.jit_module; const auto ¶meters = *launcher_ctx.parameters; @@ -126,10 +130,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, transfers; std::unordered_map device_ptrs; - char *device_result_buffer{nullptr}; - AMDGPUDriver::get_instance().malloc_async( - (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64)), nullptr); + char *device_result_buffer = nullptr; for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; @@ -161,6 +162,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, branch_counts->kNone_host_copy.fetch_add( 1, std::memory_order_relaxed); } + device_result_buffer = launcher_ctx.device_result_buffer.ensure(sizeof(uint64)); 
DeviceAllocation devalloc = executor->allocate_memory_on_device( arr_sz, (uint64 *)device_result_buffer); device_ptrs[data_ptr_idx] = @@ -193,20 +195,19 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } } - if (transfers.size() > 0) { - AMDGPUDriver::get_instance().stream_synchronize(nullptr); - } + // No pre-kernel sync needed: transfer H2Ds above are synchronous, and + // the arg-buffer async H2D below is stream-ordered with the kernel. char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { - // Malloc_Async and Free_Async are available after ROCm 5.4 + device_result_buffer = launcher_ctx.device_result_buffer.ensure(std::max(ctx.result_buffer_size, sizeof(uint64))); ctx.get_context().result_buffer = (uint64 *)device_result_buffer; } - char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().malloc_async((void **)&device_arg_buffer, - ctx.arg_buffer_size, nullptr); - AMDGPUDriver::get_instance().memcpy_host_to_device( - device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size); + char *device_arg_buffer = + launcher_ctx.device_arg_buffer.ensure(ctx.arg_buffer_size); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, + nullptr); ctx.get_context().arg_buffer = device_arg_buffer; } @@ -217,14 +218,18 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks); } QD_TRACE("Launching kernel"); - if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, nullptr); - } + bool needs_sync = false; if (ctx.result_buffer_size > 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host( - host_result_buffer, device_result_buffer, ctx.result_buffer_size); + AMDGPUDriver::get_instance().memcpy_device_to_host_async( + host_result_buffer, device_result_buffer, ctx.result_buffer_size, + nullptr); + 
// Caller reads result_buffer via get_ret() with no further sync; + // hipMemcpyDtoHAsync may return before the host sees the copy, so + // force a sync below regardless of pageable/pinned destination. + needs_sync = true; } - if (transfers.size()) { + if (!transfers.empty() || needs_sync) { + AMDGPUDriver::get_instance().stream_synchronize(nullptr); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; auto arg_id = idx.arg_id; @@ -234,7 +239,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, nullptr); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( @@ -264,3 +268,4 @@ KernelLauncher::Handle KernelLauncher::register_llvm_kernel( } // namespace amdgpu } // namespace quadrants::lang + diff --git a/quadrants/runtime/amdgpu/kernel_launcher.h b/quadrants/runtime/amdgpu/kernel_launcher.h index 23051c3ff8..cb5b2837cc 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.h +++ b/quadrants/runtime/amdgpu/kernel_launcher.h @@ -1,6 +1,7 @@ #pragma once #include "quadrants/codegen/llvm/compiled_kernel_data.h" +#include "quadrants/runtime/amdgpu/device_scratch_buffer.h" #include "quadrants/runtime/llvm/kernel_launcher.h" namespace quadrants::lang { @@ -9,10 +10,37 @@ namespace amdgpu { class KernelLauncher : public LLVM::KernelLauncher { using Base = LLVM::KernelLauncher; + // Per-handle launcher state. Each registered kernel handle owns one + // Context. Move-only because of the DeviceScratchBuffer members; the + // contexts_ vector relies on move semantics during resize. struct Context { JITModule *jit_module{nullptr}; - const std::vector> *parameters; + const std::vector> *parameters{ + nullptr}; std::vector offloaded_tasks; + + // Cached device scratch buffers, lazily sized on first launch and + // reused across subsequent launches. 
+ // + // Why cache instead of calling hipMallocAsync/hipFreeAsync per launch? + // Because on ROCm those calls have non-trivial per-call overhead even + // when the memory pool is fully primed (mutex + stream-ordering + // bookkeeping in the CLR layer that no pool tuning can eliminate). + // See DeviceScratchBuffer's header comment for the full motivation. + // + // Stream affinity: default-constructed, i.e. bound to the default + // (NULL) HIP stream, which is what the launcher uses today. When a + // non-default stream is introduced (e.g. a per-handle stream set + // during register_llvm_kernel), construct these with that stream -- + // either via the constructor argument or via set_stream() before the + // first launch -- and update the memcpy_*_async / kernel-launch + // sites in launch_llvm_kernel to use the same stream. The class + // itself needs no further changes. + // + // RAII-managed: freed automatically when this Context (and thus the + // launcher) is destroyed. + DeviceScratchBuffer device_arg_buffer; + DeviceScratchBuffer device_result_buffer; }; public: @@ -37,3 +65,4 @@ class KernelLauncher : public LLVM::KernelLauncher { } // namespace amdgpu } // namespace quadrants::lang +