Merged
181 changes: 181 additions & 0 deletions quadrants/runtime/amdgpu/device_scratch_buffer.h
@@ -0,0 +1,181 @@
#pragma once

#include <cstddef>
#include <utility>

#include "quadrants/rhi/amdgpu/amdgpu_driver.h"

namespace quadrants::lang {
namespace amdgpu {

// RAII owner of a device-side scratch buffer allocated via hipMallocAsync.
//
// Motivation
// ----------
// hipMallocAsync / hipFreeAsync on ROCm are NOT as cheap as the
// stream-ordered pool API is meant to be. Even on a fully-cached pool hit
// (i.e. when no actual device-side allocation work is required), each call
// still pays:
//
// * a mutex acquire on the pool data structure,
// * stream-ordering metadata bookkeeping in the CLR (rocclr) layer,
// * a hop into the runtime's worker thread for HSA-level scheduling.
//
// We cannot eliminate that cost from inside the driver. We have already
// raised HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD (see amdgpu_context.cpp) so
// that the pool keeps freed memory cached and never round-trips to the OS,
// but that knob only addresses the "actual allocation" branch, not the
// per-call overhead above. Empirically, eliminating the per-launch
// hipMallocAsync / hipFreeAsync calls in the kernel launcher hot path
// yields a uniform throughput improvement; nothing at the driver/pool
// layer can substitute for it.
//
// On NVIDIA the equivalent cuMemAllocAsync is fast enough that the CUDA
// backend doesn't bother caching. This is a ROCm-specific workaround at
// the application layer for a ROCm-specific cost.
//
// This class encapsulates the workaround: each kernel handle owns one of
// these per scratch buffer (arg buffer, result buffer), allocates it
// lazily on first use, reuses it across launches, grows it on demand, and
// frees it deterministically when the owning launcher is destroyed.
//
// Semantics
// ---------
// - Allocates lazily on the first ensure() that requires a non-zero size.
// - Grows monotonically: ensure(N) only re-allocates if N > current capacity;
// the buffer is never shrunk.
// - Frees its allocation in the destructor (and on grow, when discarding the
// smaller backing buffer).
// - Move-only. Copying would alias the same device pointer with two owners
// and double-free in the destructor.
//
// Stream affinity
// ---------------
// Each DeviceScratchBuffer is bound to one HIP stream for the lifetime
// of its allocation: the constructor accepts an optional `stream`
// argument, which is used for every malloc_async / mem_free_async issued
// by ensure() and the destructor. Defaults to nullptr, i.e. the default
// (NULL) stream, which is what the launcher uses today.
//
// Single-stream affinity is intentional. Stream-ordered allocators are
// safe to free on a different stream than they were allocated on, but
// only if the caller threads the appropriate event dependencies; tying
// the buffer to one stream makes the grow-cycle (mem_free_async old,
// malloc_async new) trivially ordered and lets the destructor free
// without any extra synchronisation. Callers that want a buffer on a
// different stream should construct a separate DeviceScratchBuffer with
// that stream rather than mutating an existing one.
//
// Reuse across back-to-back launches on the same stream is safe because
// the stream serialises H2D, kernel, D2H, and any subsequent grow-cycle
// free in the order they are enqueued. Callers performing async H2D
// into the buffer followed by a kernel that reads it (the typical
// kernel-launcher path) get correct ordering for free.
//
// Thread-safety
// -------------
// Not thread-safe. Concurrent ensure() calls on the same instance race on
// ptr_ / capacity_. Callers must serialise access. (Genesis launches kernels
// from a single host thread, so this is not a practical limitation.)
//
// Teardown safety
// ---------------
// In the normal flow the launcher (and therefore every DeviceScratchBuffer
// it owns) is destroyed during gs.destroy() while the AMDGPU context is
// still alive, so mem_free_async returns the memory to the pool cleanly.
// In the at-exit path, where the AMDGPU context may have already been torn
// down, the underlying driver call is a best-effort no-op; the OS reclaims
// device memory at process exit regardless.
class DeviceScratchBuffer {
public:
// Constructs an empty buffer (no allocation) bound to `stream`. The
// stream is opaque from this class's point of view (typed void* to
// match the AMDGPUDriver API surface, which mirrors hipStream_t).
// Pass nullptr (the default) for the default HIP stream, which is what
// the kernel launcher uses today; callers that introduce a non-default
// stream in the future can plumb it through here without touching this
// class.
explicit DeviceScratchBuffer(void *stream = nullptr) noexcept
: stream_(stream) {}
~DeviceScratchBuffer() noexcept { release(); }

DeviceScratchBuffer(const DeviceScratchBuffer &) = delete;
DeviceScratchBuffer &operator=(const DeviceScratchBuffer &) = delete;

// Move preserves stream affinity: the destination operates on the same
// stream the source was bound to. The source is left in a valid empty
// state on its (unchanged) stream.

Review comment: Technically it is a "valid but unspecified" state.
DeviceScratchBuffer(DeviceScratchBuffer &&other) noexcept
: stream_(other.stream_),
ptr_(other.ptr_),
capacity_(other.capacity_) {
other.ptr_ = nullptr;
other.capacity_ = 0;
}

Review comment: I think the other.stream_ is still alive in this case.

DeviceScratchBuffer &operator=(DeviceScratchBuffer &&other) noexcept {
if (this != &other) {
release();
stream_ = other.stream_;
ptr_ = other.ptr_;
capacity_ = other.capacity_;
other.ptr_ = nullptr;
other.capacity_ = 0;
}
return *this;
}

Review comment (on lines +108 to +123): Shouldn't these two functions be pretty much the same?

// Ensures the backing allocation is at least `min_bytes`, growing it
// on this buffer's bound stream if necessary. Returns the device
// pointer.
//
// The returned pointer is invalidated by any subsequent ensure() call
// that grows the buffer; callers must not retain it across launches.
char *ensure(std::size_t min_bytes) {
if (capacity_ < min_bytes) {
if (ptr_ != nullptr) {
AMDGPUDriver::get_instance().mem_free_async(ptr_, stream_);
ptr_ = nullptr;
}
AMDGPUDriver::get_instance().malloc_async(
reinterpret_cast<void **>(&ptr_), min_bytes, stream_);
capacity_ = min_bytes;
}
return ptr_;
}

Review comment: Is this ensuring that there are at least min_bytes in the internal buffer, and then returning the pointer with at least that many bytes? Maybe a better name would be t

Review comment (on lines +136 to +143), suggesting an allocate-before-free order:

void *new_ptr = nullptr;
AMDGPUDriver::get_instance().malloc_async(&new_ptr, min_bytes, stream_);
if (ptr_ != nullptr) {
  AMDGPUDriver::get_instance().mem_free_async(ptr_, stream_);
}
ptr_ = static_cast<char *>(new_ptr);
capacity_ = min_bytes;

This way we keep the previous allocation in case the new malloc_async fails.

char *get() const noexcept { return ptr_; }
std::size_t capacity() const noexcept { return capacity_; }
void *stream() const noexcept { return stream_; }

// Rebinds the buffer to a new stream. Only legal on an empty buffer
// (capacity == 0); rebinding while holding an allocation would risk
// freeing on a stream that doesn't have a valid happens-before edge
// to the buffer's last use. Callers that want to migrate a populated
// buffer to a different stream should release() first, set_stream(),
// then ensure() again.
void set_stream(void *stream) noexcept {
QD_ASSERT(capacity_ == 0);
stream_ = stream;
}

// Eagerly releases the underlying allocation on this buffer's bound
// stream. Idempotent. Invoked automatically by the destructor;
// explicit invocation is rarely needed.
void release() noexcept {
if (ptr_ != nullptr) {
AMDGPUDriver::get_instance().mem_free_async(ptr_, stream_);
ptr_ = nullptr;
capacity_ = 0;
}
}

private:
void *stream_{nullptr};
char *ptr_{nullptr};
std::size_t capacity_{0};
};

Review comment (@gpinkert, Apr 24, 2026): Why is the type a char? Would a uint8_t or a std::byte be clearer?

Author reply: It is char because the caller was using char*; technically, it should be void*.

} // namespace amdgpu
} // namespace quadrants::lang
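
Taken together, the header supports the per-launch pattern below. This is an illustrative sketch, not code from this PR: it assumes the header is on the include path, that AMDGPUDriver lives in the same namespace (as the unqualified calls above suggest), and that memcpy_host_to_device_async has the (dst, src, bytes, stream) signature used in kernel_launcher.cpp below; launch_once and host_args are made-up names.

#include <cstddef>

#include "quadrants/runtime/amdgpu/device_scratch_buffer.h"

using quadrants::lang::amdgpu::AMDGPUDriver;
using quadrants::lang::amdgpu::DeviceScratchBuffer;

// One scratch buffer per kernel handle, bound to the default (NULL) stream.
DeviceScratchBuffer arg_buf;

void launch_once(void *host_args, std::size_t arg_bytes) {
  // First call allocates via malloc_async; later calls reuse the cached
  // allocation and only re-allocate when arg_bytes exceeds capacity().
  char *dev_args = arg_buf.ensure(arg_bytes);
  // H2D copy and kernel are enqueued on the same stream, so stream order
  // alone guarantees the kernel sees the copied bytes -- no sync needed.
  AMDGPUDriver::get_instance().memcpy_host_to_device_async(
      dev_args, host_args, arg_bytes, /*stream=*/nullptr);
  // ... enqueue the kernel that reads dev_args here ...
}
// arg_buf's destructor returns the allocation to the pool via mem_free_async.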

47 changes: 26 additions & 21 deletions quadrants/runtime/amdgpu/kernel_launcher.cpp
@@ -102,10 +102,14 @@ bool KernelLauncher::on_amdgpu_device(void *ptr) {
return ret_code == HIP_SUCCESS && attr_val[0] == HIP_MEMORYTYPE_DEVICE;
}

+ // Hot path. Uses cached per-handle device scratch buffers, async H2D/D2H,
+ // and lazy result-buffer materialisation to eliminate per-launch
+ // hipMallocAsync / hipFreeAsync overhead. See device_scratch_buffer.h
+ // for the full motivation.
void KernelLauncher::launch_llvm_kernel(Handle handle,
LaunchContextBuilder &ctx) {
QD_ASSERT(handle.get_launch_id() < contexts_.size());
- auto launcher_ctx = contexts_[handle.get_launch_id()];
+ auto &launcher_ctx = contexts_[handle.get_launch_id()];
auto *executor = get_runtime_executor();
auto *amdgpu_module = launcher_ctx.jit_module;
const auto &parameters = *launcher_ctx.parameters;
@@ -126,10 +130,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
transfers;
std::unordered_map<ArgArrayPtrKey, void *, ArgArrayPtrKeyHasher> device_ptrs;

- char *device_result_buffer{nullptr};
- AMDGPUDriver::get_instance().malloc_async(
-     (void **)&device_result_buffer,
-     std::max(ctx.result_buffer_size, sizeof(uint64)), nullptr);
+ char *device_result_buffer = nullptr;

for (int i = 0; i < (int)parameters.size(); i++) {
const auto &kv = parameters[i];
@@ -161,6 +162,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
branch_counts->kNone_host_copy.fetch_add(
1, std::memory_order_relaxed);
}
+ device_result_buffer = launcher_ctx.device_result_buffer.ensure(sizeof(uint64));
DeviceAllocation devalloc = executor->allocate_memory_on_device(
arr_sz, (uint64 *)device_result_buffer);
device_ptrs[data_ptr_idx] =
@@ -193,20 +195,19 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
}
}
}
- if (transfers.size() > 0) {
-   AMDGPUDriver::get_instance().stream_synchronize(nullptr);
- }
+ // No pre-kernel sync needed: transfer H2Ds above are synchronous, and
+ // the arg-buffer async H2D below is stream-ordered with the kernel.
char *host_result_buffer = (char *)ctx.get_context().result_buffer;
if (ctx.result_buffer_size > 0) {
- // Malloc_Async and Free_Async are available after ROCm 5.4
+ device_result_buffer = launcher_ctx.device_result_buffer.ensure(
+     std::max(ctx.result_buffer_size, sizeof(uint64)));
ctx.get_context().result_buffer = (uint64 *)device_result_buffer;
}
- char *device_arg_buffer = nullptr;
if (ctx.arg_buffer_size > 0) {
- AMDGPUDriver::get_instance().malloc_async((void **)&device_arg_buffer,
-     ctx.arg_buffer_size, nullptr);
- AMDGPUDriver::get_instance().memcpy_host_to_device(
-     device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size);
+ char *device_arg_buffer =
+     launcher_ctx.device_arg_buffer.ensure(ctx.arg_buffer_size);
+ AMDGPUDriver::get_instance().memcpy_host_to_device_async(
+     device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size,
+     nullptr);
ctx.get_context().arg_buffer = device_arg_buffer;
}

@@ -217,14 +218,18 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks);
}
QD_TRACE("Launching kernel");
- if (ctx.arg_buffer_size > 0) {
-   AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, nullptr);
- }
+ bool needs_sync = false;
if (ctx.result_buffer_size > 0) {
- AMDGPUDriver::get_instance().memcpy_device_to_host(
-     host_result_buffer, device_result_buffer, ctx.result_buffer_size);
+ AMDGPUDriver::get_instance().memcpy_device_to_host_async(
+     host_result_buffer, device_result_buffer, ctx.result_buffer_size,
+     nullptr);
+ // Caller reads result_buffer via get_ret() with no further sync;
+ // hipMemcpyDtoHAsync may return before the host sees the copy, so
+ // force a sync below regardless of pageable/pinned destination.
+ needs_sync = true;
}
- if (transfers.size()) {
+ if (!transfers.empty() || needs_sync) {
AMDGPUDriver::get_instance().stream_synchronize(nullptr);
for (auto itr = transfers.begin(); itr != transfers.end(); itr++) {
auto &idx = itr->first;
auto arg_id = idx.arg_id;
Expand All @@ -234,7 +239,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle,
executor->deallocate_memory_on_device(itr->second.second);
}
}
- AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, nullptr);
}

KernelLauncher::Handle KernelLauncher::register_llvm_kernel(
@@ -264,3 +268,4 @@ KernelLauncher::Handle KernelLauncher::register_llvm_kernel(

} // namespace amdgpu
} // namespace quadrants::lang
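
Pieced together from the hunks above, the post-PR hot path has the following shape. This is a condensed sketch rather than the literal function body: names follow the diff, while error handling and the array-argument transfer loop are elided.

char *host_result_buffer = (char *)ctx.get_context().result_buffer;
char *device_result_buffer = nullptr;
if (ctx.result_buffer_size > 0) {
  // Cached: no hipMallocAsync on the hot path after the first launch.
  device_result_buffer = launcher_ctx.device_result_buffer.ensure(
      std::max(ctx.result_buffer_size, sizeof(uint64)));
  ctx.get_context().result_buffer = (uint64 *)device_result_buffer;
}
if (ctx.arg_buffer_size > 0) {
  char *device_arg_buffer =
      launcher_ctx.device_arg_buffer.ensure(ctx.arg_buffer_size);
  AMDGPUDriver::get_instance().memcpy_host_to_device_async(
      device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size,
      nullptr);
  ctx.get_context().arg_buffer = device_arg_buffer;
}
// Stream order serialises the async H2D above with the kernel below.
launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks);
bool needs_sync = false;
if (ctx.result_buffer_size > 0) {
  AMDGPUDriver::get_instance().memcpy_device_to_host_async(
      host_result_buffer, device_result_buffer, ctx.result_buffer_size,
      nullptr);
  needs_sync = true;  // caller reads the host buffer with no further sync
}
if (needs_sync) {
  AMDGPUDriver::get_instance().stream_synchronize(nullptr);
}
// No mem_free_async anywhere: the scratch buffers outlive the launch and
// are freed when the launcher's Context is destroyed.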

31 changes: 30 additions & 1 deletion quadrants/runtime/amdgpu/kernel_launcher.h
@@ -1,6 +1,7 @@
#pragma once

#include "quadrants/codegen/llvm/compiled_kernel_data.h"
#include "quadrants/runtime/amdgpu/device_scratch_buffer.h"
#include "quadrants/runtime/llvm/kernel_launcher.h"

namespace quadrants::lang {
@@ -9,10 +10,37 @@ namespace amdgpu {
class KernelLauncher : public LLVM::KernelLauncher {
using Base = LLVM::KernelLauncher;

+ // Per-handle launcher state. Each registered kernel handle owns one
+ // Context. Move-only because of the DeviceScratchBuffer members; the
+ // contexts_ vector relies on move semantics during resize.
struct Context {
JITModule *jit_module{nullptr};
- const std::vector<std::pair<int, Callable::Parameter>> *parameters;
+ const std::vector<std::pair<int, Callable::Parameter>> *parameters{
+     nullptr};
std::vector<OffloadedTask> offloaded_tasks;

+ // Cached device scratch buffers, lazily sized on first launch and
+ // reused across subsequent launches.
+ //
+ // Why cache instead of calling hipMallocAsync/hipFreeAsync per launch?
+ // Because on ROCm those calls have non-trivial per-call overhead even
+ // when the memory pool is fully primed (mutex + stream-ordering
+ // bookkeeping in the CLR layer that no pool tuning can eliminate).
+ // See DeviceScratchBuffer's header comment for the full motivation.
+ //
+ // Stream affinity: default-constructed, i.e. bound to the default
+ // (NULL) HIP stream, which is what the launcher uses today. When a
+ // non-default stream is introduced (e.g. a per-handle stream set
+ // during register_llvm_kernel), construct these with that stream --
+ // either via the constructor argument or via set_stream() before the
+ // first launch -- and update the memcpy_*_async / kernel-launch
+ // sites in launch_llvm_kernel to use the same stream. The class
+ // itself needs no further changes.
+ //
+ // RAII-managed: freed automatically when this Context (and thus the
+ // launcher) is destroyed.
+ DeviceScratchBuffer device_arg_buffer;
+ DeviceScratchBuffer device_result_buffer;
};

public:
@@ -37,3 +65,4 @@ class KernelLauncher : public LLVM::KernelLauncher {

} // namespace amdgpu
} // namespace quadrants::lang
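
Because the move-only requirement on Context is easy to miss, here is a small standalone sketch of what DeviceScratchBuffer's move semantics guarantee during vector growth. Illustrative only; it assumes the header above is on the include path and that a HIP context is alive so the async driver calls succeed.

#include <utility>
#include <vector>

#include "quadrants/runtime/amdgpu/device_scratch_buffer.h"

using quadrants::lang::amdgpu::DeviceScratchBuffer;

int main() {
  std::vector<DeviceScratchBuffer> bufs;
  bufs.emplace_back();   // bound to the default (NULL) stream
  bufs[0].ensure(4096);  // lazy first allocation on that stream
  bufs.emplace_back();   // growth may move bufs[0]: the device pointer is
                         // transferred, never copied, so exactly one
                         // destructor frees it
  DeviceScratchBuffer owner = std::move(bufs[0]);
  // bufs[0] is now empty (get() == nullptr, capacity() == 0) but keeps its
  // stream binding; `owner` frees the 4096-byte allocation on destruction.
  return 0;
}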
