ROCm · deepsek · Apr 17, 2026 · Apr 17, 2026
diff --git a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp
diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp
@@ -38,6 +38,8 @@ FunctionCreationGuard::FunctionCreationGuard(
                                 llvm::Function::InternalLinkage, func_name,
                                 mb->module.get());
   old_func = mb->func;
+  old_context_val_alloca = mb->context_val_alloca_;
+  mb->context_val_alloca_ = nullptr;
   // emit into loop body function
   mb->func = body;
 
@@ -74,6 +76,7 @@ FunctionCreationGuard::~FunctionCreationGuard() {
   mb->entry_block = old_entry;
   mb->final_block = old_final;
   mb->func = old_func;
+  mb->context_val_alloca_ = old_context_val_alloca;
   mb->builder->restoreIP(ip);
 
   QD_ASSERT(!llvm::verifyFunction(*body, &llvm::errs()));
@@ -2015,9 +2018,15 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt,
   current_loop_reentry = nullptr;
   current_while_after_loop = nullptr;
 
+  llvm::Type *context_param_type;
+  if (kernel_argument_struct_in_kernarg()) {
+    context_param_type = context_ty;
+  } else {
+    context_param_type = llvm::PointerType::get(context_ty, 0);
+  }
   task_function_type =
       llvm::FunctionType::get(llvm::Type::getVoidTy(*llvm_context),
-                              {llvm::PointerType::get(context_ty, 0)}, false);
+                              {context_param_type}, false);
 
   auto task_kernel_name = fmt::format(
       "{}_{}_{}{}", kernel_name, task_codegen_id, stmt->task_name(), suffix);
@@ -2041,6 +2050,13 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt,
   // The real function body
   func_body_bb = llvm::BasicBlock::Create(*llvm_context, "body", func);
   builder->SetInsertPoint(func_body_bb);
+
+  if (kernel_argument_struct_in_kernarg()) {
+    context_val_alloca_ = create_entry_block_alloca(context_ty);
+    builder->CreateStore(kernel_args[0], context_val_alloca_);
+  } else {
+    context_val_alloca_ = nullptr;
+  }
   return task_kernel_name;
 }
 
@@ -2627,6 +2643,8 @@ llvm::Value *TaskCodeGenLLVM::get_arg(int i) {
 }
 
 llvm::Value *TaskCodeGenLLVM::get_context() {
+  if (context_val_alloca_)  // set when the context struct arrives by value
+    return context_val_alloca_;  // pointer to the entry-block copy
   return get_arg(0);
 }
 
@@ -2732,7 +2750,7 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
     for (const auto &task : offloaded_tasks) {
       llvm::Function *func = module->getFunction(task.name);
       QD_ASSERT(func);
-      tlctx->mark_function_as_amdgpu_kernel(func);
+      tlctx->mark_function_as_amdgpu_kernel(func, task.block_dim);
     }
 #if defined(QD_WITH_AMDGPU)
     llvm::legacy::FunctionPassManager fpm(module.get());

diff --git a/quadrants/codegen/llvm/codegen_llvm.h b/quadrants/codegen/llvm/codegen_llvm.h
@@ -22,6 +22,7 @@ class FunctionCreationGuard {
   llvm::Function *body;
   llvm::BasicBlock *old_entry, *allocas, *entry, *old_final, *final;
   llvm::IRBuilder<>::InsertPoint ip;
+  llvm::Value *old_context_val_alloca{nullptr};
 
   FunctionCreationGuard(TaskCodeGenLLVM *mb,
                         std::vector<llvm::Type *> arguments,
@@ -97,6 +98,8 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
 
   llvm::Value *get_tls_base_ptr();
 
+  llvm::Value *context_val_alloca_{nullptr};
+
   llvm::Type *get_tls_buffer_type();
 
   std::vector<llvm::Type *> get_xlogue_argument_types();
@@ -111,7 +114,7 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
 
   llvm::Value *get_root(int snode_tree_id);
 
-  llvm::Value *get_runtime();
+  virtual llvm::Value *get_runtime();
 
   void emit_struct_meta_base(const std::string &name,
                              llvm::Value *node_meta,
@@ -330,6 +333,13 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
     return false;  // on CPU devices just pass in a pointer
   }
 
+  // Whether the task function takes the context struct by value (AMDGPU:
+  // the calling-convention verifier rejects byval, so the struct goes in
+  // kernarg and is stored to an entry-block alloca to obtain a pointer).
+  virtual bool kernel_argument_struct_in_kernarg() const {
+    return false;  // base default: the context is passed as a pointer
+  }
+
   std::string init_offloaded_task_function(OffloadedStmt *stmt,
                                            std::string suffix = "");
 

diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h
@@ -102,6 +102,8 @@ struct CompileConfig {
 
   size_t cuda_stack_limit{0};
 
+  bool amdgpu_auto_waves_per_eu{true};
+
   CompileConfig();
 
   void fit();

diff --git a/quadrants/program/extension.cpp b/quadrants/program/extension.cpp
@@ -19,7 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
        {Extension::sparse, Extension::quant, Extension::quant_basic,
         Extension::data64, Extension::adstack, Extension::bls,
         Extension::assertion, Extension::mesh}},
-      {Arch::amdgpu, {Extension::assertion}},
+      {Arch::amdgpu, {Extension::assertion, Extension::bls}},
       {Arch::metal, {}},
       {Arch::vulkan, {}},
   };

diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp
@@ -201,6 +201,8 @@ void export_lang(py::module &m) {
       .def_readwrite("default_gpu_block_dim",
                      &CompileConfig::default_gpu_block_dim)
       .def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg)
+      .def_readwrite("amdgpu_auto_waves_per_eu",
+                     &CompileConfig::amdgpu_auto_waves_per_eu)
       .def_readwrite("saturating_grid_dim", &CompileConfig::saturating_grid_dim)
       .def_readwrite("max_block_dim", &CompileConfig::max_block_dim)
       .def_readwrite("cpu_max_num_threads", &CompileConfig::cpu_max_num_threads)

diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp
@@ -84,6 +84,21 @@ AMDGPUContext::AMDGPUContext()
   mcpu_ = mcpu_.substr(0, mcpu_.find(":"));
   std::free(hip_device_prop);
 
+  if (driver_.device_get_default_mem_pool.is_available()) {
+    void *default_mem_pool = nullptr;
+    uint32 err = driver_.device_get_default_mem_pool.call_with_warning(
+        &default_mem_pool, 0);
+    if (err == HIP_SUCCESS && default_mem_pool != nullptr) {
+      supports_mem_pool_ = true;
+      constexpr uint64 kMemPoolReleaseThreshold = 1048576 * 128;
+      driver_.mem_pool_set_attribute(default_mem_pool,
+                                     HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                                     (void *)&kMemPoolReleaseThreshold);
+      QD_TRACE("HIP memory pool enabled (release threshold: {} bytes)",
+               kMemPoolReleaseThreshold);
+    }
+  }
+
   QD_TRACE("Emitting AMDGPU code for {}", mcpu_);
 }
 
@@ -175,7 +190,8 @@ void AMDGPUContext::launch(void *func,
     bool valid =
         offline_cache::try_demangle_name(task_name, primal_task_name, key);
     profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name,
-                           func, grid_dim, block_dim, 0);
+                           func, grid_dim, block_dim,
+                           dynamic_shared_mem_bytes);
   }
 
   auto context_guard = AMDGPUContext::get_instance().get_guard();

diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h
@@ -23,6 +23,7 @@ class AMDGPUContext {
   KernelProfilerBase *profiler_{nullptr};
   AMDGPUDriver &driver_;
   bool debug_{false};
+  bool supports_mem_pool_{false};
   std::vector<void *> kernel_arg_pointer_;
 
  public:
@@ -36,13 +37,21 @@ class AMDGPUContext {
     return dev_count_ != 0;
   }
 
+  bool supports_mem_pool() const {
+    return supports_mem_pool_;  // set in the ctor if a default pool exists
+  }
+
   void push_back_kernel_arg_pointer(void *ptr) {
     kernel_arg_pointer_.push_back(ptr);
   }
 
   void free_kernel_arg_pointer() {
     for (auto &i : kernel_arg_pointer_) {
-      AMDGPUDriver::get_instance().mem_free(i);
+      if (supports_mem_pool_) {
+        AMDGPUDriver::get_instance().mem_free_async(i, nullptr);
+      } else {
+        AMDGPUDriver::get_instance().mem_free(i);
+      }
     }
     kernel_arg_pointer_.erase(kernel_arg_pointer_.begin(),
                               kernel_arg_pointer_.end());

diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp
@@ -56,19 +56,26 @@ DeviceAllocation AmdgpuDevice::allocate_memory_runtime(
     const LlvmRuntimeAllocParams &params) {
   AllocInfo info;
   info.size = quadrants::iroundup(params.size, quadrants_page_size);
-  if (params.host_read || params.host_write) {
+  if (info.size == 0) {
+    info.ptr = nullptr;
+  } else if (params.use_memory_pool) {
+    AMDGPUDriver::get_instance().malloc_async((void **)&info.ptr, info.size,
+                                              nullptr);
+  } else if (params.host_read || params.host_write) {
     QD_NOT_IMPLEMENTED
   } else {
     info.ptr = DeviceMemoryPool::get_instance(Arch::amdgpu,
                                               false /*merge_upon_release*/)
                    .allocate_with_cache(this, params);
-    QD_ASSERT(info.ptr != nullptr);
+  }
 
+  if (info.ptr)
     AMDGPUDriver::get_instance().memset((void *)info.ptr, 0, info.size);
-  }
+
   info.is_imported = false;
   info.use_cached = true;
   info.use_preallocated = true;
+  info.use_memory_pool = params.use_memory_pool;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -98,11 +105,17 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) {
 
   validate_device_alloc(handle);
   AllocInfo &info = allocations_[handle.alloc_id];
+
+  if (info.size == 0) {
+    return;
+  }
   if (info.ptr == nullptr) {
     QD_ERROR("the DeviceAllocation is already deallocated");
   }
   QD_ASSERT(!info.is_imported);
-  if (info.use_cached) {
+  if (info.use_memory_pool) {
+    AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr);
+  } else if (info.use_cached) {
     DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/)
         .release(info.size, (uint64_t *)info.ptr, false);
   } else if (!info.use_preallocated) {

diff --git a/quadrants/rhi/amdgpu/amdgpu_device.h b/quadrants/rhi/amdgpu/amdgpu_device.h
@@ -75,6 +75,7 @@ class AmdgpuDevice : public LlvmDevice {
     bool is_imported{false};
     bool use_preallocated{true};
     bool use_cached{false};
+    bool use_memory_pool{false};
     void *mapped{nullptr};
   };
 

diff --git a/quadrants/rhi/amdgpu/amdgpu_driver.cpp b/quadrants/rhi/amdgpu/amdgpu_driver.cpp
@@ -79,5 +79,21 @@ AMDGPUDriver &AMDGPUDriver::get_instance() {
   return get_instance_without_context();
 }
 
+void AMDGPUDriver::malloc_async(void **dev_ptr, size_t size, void *stream) {
+  // Without a detected HIP memory pool, allocate through plain hipMalloc.
+  if (!AMDGPUContext::get_instance().supports_mem_pool())
+    malloc(dev_ptr, size);
+  else
+    malloc_async_impl(dev_ptr, size, stream);
+}
+
+void AMDGPUDriver::mem_free_async(void *dev_ptr, void *stream) {
+  // Without a detected HIP memory pool, release through plain hipFree.
+  if (!AMDGPUContext::get_instance().supports_mem_pool())
+    mem_free(dev_ptr);
+  else
+    mem_free_async_impl(dev_ptr, stream);
+}
+
 }  // namespace lang
 }  // namespace quadrants
diff --git a/quadrants/rhi/amdgpu/amdgpu_driver.h b/quadrants/rhi/amdgpu/amdgpu_driver.h
@@ -13,6 +13,7 @@ constexpr uint32 HIP_STREAM_NON_BLOCKING = 0x1;
 constexpr uint32 HIP_MEM_ATTACH_GLOBAL = 0x1;
 constexpr uint32 HIP_MEM_ADVISE_SET_PREFERRED_LOCATION = 3;
 constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 26;
+constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39;
 constexpr uint32 HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 63;
 // sizeof(hipDeviceProperties_t) in ROCm 6.
 // ROCm 5.7.1 is 792 and ROCm 6 is 1472, so to make both work we use whichever
@@ -35,6 +36,7 @@ constexpr uint32 HIP_JIT_MAX_REGISTERS = 0;
 constexpr uint32 HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2;
 constexpr uint32 HIP_SUCCESS = 0;
 constexpr uint32 HIP_MEMORYTYPE_DEVICE = 1;
+constexpr uint32 HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD = 4;
 
 std::string get_amdgpu_error_message(uint32 err);
 
@@ -70,6 +72,10 @@ class AMDGPUFunction {
            fmt::format(" while calling {} ({})", name_, symbol_name_);
   }
 
+  bool is_available() const {
+    return function_ != nullptr;  // presumably null if symbol lookup failed
+  }
+
   uint32 call_with_warning(Args... args) {
     auto err = call(args...);
     QD_WARN_IF(err, "{}", get_error_message(err));
@@ -117,6 +123,10 @@ class AMDGPUDriver : protected AMDGPUDriverBase {
 
   void (*runtime_get_version)(int *);
 
+  void malloc_async(void **ptr, size_t size, void *stream);
+
+  void mem_free_async(void *ptr, void *stream);
+
   bool detected();
 
   static AMDGPUDriver &get_instance();

diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h
@@ -69,14 +69,25 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async,
                     std::size_t,
                     void *);
 PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t);
+PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *);
 PER_AMDGPU_FUNCTION(malloc_managed,
                     hipMallocManaged,
                     void **,
                     std::size_t,
                     uint32);
 PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t);
 PER_AMDGPU_FUNCTION(mem_free, hipFree, void *);
+PER_AMDGPU_FUNCTION(mem_free_async_impl, hipFreeAsync, void *, void *);
 PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *);
+PER_AMDGPU_FUNCTION(device_get_default_mem_pool,
+                    hipDeviceGetDefaultMemPool,
+                    void **,
+                    int);
+PER_AMDGPU_FUNCTION(mem_pool_set_attribute,
+                    hipMemPoolSetAttribute,
+                    void *,
+                    uint32,
+                    void *);
 PER_AMDGPU_FUNCTION(mem_get_attribute,
                     hipPointerGetAttribute,
                     void *,