Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
324 changes: 192 additions & 132 deletions quadrants/codegen/amdgpu/codegen_amdgpu.cpp

Large diffs are not rendered by default.

22 changes: 20 additions & 2 deletions quadrants/codegen/llvm/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ FunctionCreationGuard::FunctionCreationGuard(
llvm::Function::InternalLinkage, func_name,
mb->module.get());
old_func = mb->func;
old_context_val_alloca = mb->context_val_alloca_;
mb->context_val_alloca_ = nullptr;
// emit into loop body function
mb->func = body;

Expand Down Expand Up @@ -74,6 +76,7 @@ FunctionCreationGuard::~FunctionCreationGuard() {
mb->entry_block = old_entry;
mb->final_block = old_final;
mb->func = old_func;
mb->context_val_alloca_ = old_context_val_alloca;
mb->builder->restoreIP(ip);

QD_ASSERT(!llvm::verifyFunction(*body, &llvm::errs()));
Expand Down Expand Up @@ -2015,9 +2018,15 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt,
current_loop_reentry = nullptr;
current_while_after_loop = nullptr;

llvm::Type *context_param_type;
if (kernel_argument_struct_in_kernarg()) {
context_param_type = context_ty;
} else {
context_param_type = llvm::PointerType::get(context_ty, 0);
}
task_function_type =
llvm::FunctionType::get(llvm::Type::getVoidTy(*llvm_context),
{llvm::PointerType::get(context_ty, 0)}, false);
{context_param_type}, false);

auto task_kernel_name = fmt::format(
"{}_{}_{}{}", kernel_name, task_codegen_id, stmt->task_name(), suffix);
Expand All @@ -2041,6 +2050,13 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt,
// The real function body
func_body_bb = llvm::BasicBlock::Create(*llvm_context, "body", func);
builder->SetInsertPoint(func_body_bb);

if (kernel_argument_struct_in_kernarg()) {
context_val_alloca_ = create_entry_block_alloca(context_ty);
builder->CreateStore(kernel_args[0], context_val_alloca_);
} else {
context_val_alloca_ = nullptr;
}
return task_kernel_name;
}

Expand Down Expand Up @@ -2627,6 +2643,8 @@ llvm::Value *TaskCodeGenLLVM::get_arg(int i) {
}

// Returns the value used to access the kernel context inside the function
// currently being emitted.
//
// When `context_val_alloca_` is set, that value is returned; otherwise the
// function's first argument (`get_arg(0)`) is used.
llvm::Value *TaskCodeGenLLVM::get_context() {
  return context_val_alloca_ != nullptr ? context_val_alloca_ : get_arg(0);
}

Expand Down Expand Up @@ -2732,7 +2750,7 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() {
for (const auto &task : offloaded_tasks) {
llvm::Function *func = module->getFunction(task.name);
QD_ASSERT(func);
tlctx->mark_function_as_amdgpu_kernel(func);
tlctx->mark_function_as_amdgpu_kernel(func, task.block_dim);
}
#if defined(QD_WITH_AMDGPU)
llvm::legacy::FunctionPassManager fpm(module.get());
Expand Down
12 changes: 11 additions & 1 deletion quadrants/codegen/llvm/codegen_llvm.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class FunctionCreationGuard {
llvm::Function *body;
llvm::BasicBlock *old_entry, *allocas, *entry, *old_final, *final;
llvm::IRBuilder<>::InsertPoint ip;
llvm::Value *old_context_val_alloca{nullptr};

FunctionCreationGuard(TaskCodeGenLLVM *mb,
std::vector<llvm::Type *> arguments,
Expand Down Expand Up @@ -97,6 +98,8 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {

llvm::Value *get_tls_base_ptr();

llvm::Value *context_val_alloca_{nullptr};

llvm::Type *get_tls_buffer_type();

std::vector<llvm::Type *> get_xlogue_argument_types();
Expand All @@ -111,7 +114,7 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {

llvm::Value *get_root(int snode_tree_id);

llvm::Value *get_runtime();
virtual llvm::Value *get_runtime();

void emit_struct_meta_base(const std::string &name,
llvm::Value *node_meta,
Expand Down Expand Up @@ -330,6 +333,13 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
return false; // on CPU devices just pass in a pointer
}

// Selects how the context struct is passed to an offloaded task function.
//
// The AMDGPU calling convention verifier disallows the byval attribute, so
// on that target the struct type is passed directly in kernarg and stored to
// an alloca at function entry to obtain a pointer to it.
//
// This base implementation returns false, i.e. the task function takes a
// pointer to the context struct.
virtual bool kernel_argument_struct_in_kernarg() const { return false; }

std::string init_offloaded_task_function(OffloadedStmt *stmt,
std::string suffix = "");

Expand Down
2 changes: 2 additions & 0 deletions quadrants/program/compile_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ struct CompileConfig {

size_t cuda_stack_limit{0};

bool amdgpu_auto_waves_per_eu{true};

CompileConfig();

void fit();
Expand Down
2 changes: 1 addition & 1 deletion quadrants/program/extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
{Extension::sparse, Extension::quant, Extension::quant_basic,
Extension::data64, Extension::adstack, Extension::bls,
Extension::assertion, Extension::mesh}},
{Arch::amdgpu, {Extension::assertion}},
{Arch::amdgpu, {Extension::assertion, Extension::bls}},
{Arch::metal, {}},
{Arch::vulkan, {}},
};
Expand Down
2 changes: 2 additions & 0 deletions quadrants/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ void export_lang(py::module &m) {
.def_readwrite("default_gpu_block_dim",
&CompileConfig::default_gpu_block_dim)
.def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg)
.def_readwrite("amdgpu_auto_waves_per_eu",
&CompileConfig::amdgpu_auto_waves_per_eu)
.def_readwrite("saturating_grid_dim", &CompileConfig::saturating_grid_dim)
.def_readwrite("max_block_dim", &CompileConfig::max_block_dim)
.def_readwrite("cpu_max_num_threads", &CompileConfig::cpu_max_num_threads)
Expand Down
18 changes: 17 additions & 1 deletion quadrants/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,21 @@ AMDGPUContext::AMDGPUContext()
mcpu_ = mcpu_.substr(0, mcpu_.find(":"));
std::free(hip_device_prop);

if (driver_.device_get_default_mem_pool.is_available()) {
void *default_mem_pool = nullptr;
uint32 err = driver_.device_get_default_mem_pool.call_with_warning(
&default_mem_pool, 0);
if (err == HIP_SUCCESS && default_mem_pool != nullptr) {
supports_mem_pool_ = true;
constexpr uint64 kMemPoolReleaseThreshold = 1048576 * 128;
driver_.mem_pool_set_attribute(default_mem_pool,
HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD,
(void *)&kMemPoolReleaseThreshold);
QD_TRACE("HIP memory pool enabled (release threshold: {} bytes)",
kMemPoolReleaseThreshold);
}
}

QD_TRACE("Emitting AMDGPU code for {}", mcpu_);
}

Expand Down Expand Up @@ -175,7 +190,8 @@ void AMDGPUContext::launch(void *func,
bool valid =
offline_cache::try_demangle_name(task_name, primal_task_name, key);
profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name,
func, grid_dim, block_dim, 0);
func, grid_dim, block_dim,
dynamic_shared_mem_bytes);
}

auto context_guard = AMDGPUContext::get_instance().get_guard();
Expand Down
11 changes: 10 additions & 1 deletion quadrants/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class AMDGPUContext {
KernelProfilerBase *profiler_{nullptr};
AMDGPUDriver &driver_;
bool debug_{false};
bool supports_mem_pool_{false};
std::vector<void *> kernel_arg_pointer_;

public:
Expand All @@ -36,13 +37,21 @@ class AMDGPUContext {
return dev_count_ != 0;
}

// Reports the value of the `supports_mem_pool_` flag.
bool supports_mem_pool() const { return supports_mem_pool_; }

void push_back_kernel_arg_pointer(void *ptr) {
kernel_arg_pointer_.push_back(ptr);
}

void free_kernel_arg_pointer() {
for (auto &i : kernel_arg_pointer_) {
AMDGPUDriver::get_instance().mem_free(i);
if (supports_mem_pool_) {
AMDGPUDriver::get_instance().mem_free_async(i, nullptr);
} else {
AMDGPUDriver::get_instance().mem_free(i);
}
}
kernel_arg_pointer_.erase(kernel_arg_pointer_.begin(),
kernel_arg_pointer_.end());
Expand Down
21 changes: 17 additions & 4 deletions quadrants/rhi/amdgpu/amdgpu_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,26 @@ DeviceAllocation AmdgpuDevice::allocate_memory_runtime(
const LlvmRuntimeAllocParams &params) {
AllocInfo info;
info.size = quadrants::iroundup(params.size, quadrants_page_size);
if (params.host_read || params.host_write) {
if (info.size == 0) {
info.ptr = nullptr;
} else if (params.use_memory_pool) {
AMDGPUDriver::get_instance().malloc_async((void **)&info.ptr, info.size,
nullptr);
} else if (params.host_read || params.host_write) {
QD_NOT_IMPLEMENTED
} else {
info.ptr = DeviceMemoryPool::get_instance(Arch::amdgpu,
false /*merge_upon_release*/)
.allocate_with_cache(this, params);
QD_ASSERT(info.ptr != nullptr);
}

if (info.ptr)
AMDGPUDriver::get_instance().memset((void *)info.ptr, 0, info.size);
}

info.is_imported = false;
info.use_cached = true;
info.use_preallocated = true;
info.use_memory_pool = params.use_memory_pool;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
Expand Down Expand Up @@ -98,11 +105,17 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) {

validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];

if (info.size == 0) {
return;
}
if (info.ptr == nullptr) {
QD_ERROR("the DeviceAllocation is already deallocated");
}
QD_ASSERT(!info.is_imported);
if (info.use_cached) {
if (info.use_memory_pool) {
AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr);
} else if (info.use_cached) {
DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/)
.release(info.size, (uint64_t *)info.ptr, false);
} else if (!info.use_preallocated) {
Expand Down
1 change: 1 addition & 0 deletions quadrants/rhi/amdgpu/amdgpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class AmdgpuDevice : public LlvmDevice {
bool is_imported{false};
bool use_preallocated{true};
bool use_cached{false};
bool use_memory_pool{false};
void *mapped{nullptr};
};

Expand Down
16 changes: 16 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,21 @@ AMDGPUDriver &AMDGPUDriver::get_instance() {
return get_instance_without_context();
}

// Allocates `size` bytes and writes the resulting pointer to `*dev_ptr`.
//
// If the AMDGPU context reports memory pool support, the request goes to
// `malloc_async_impl` together with `stream`. Otherwise it falls back to
// `malloc`, and `stream` is not used.
void AMDGPUDriver::malloc_async(void **dev_ptr, size_t size, void *stream) {
  const bool pool_available = AMDGPUContext::get_instance().supports_mem_pool();
  if (!pool_available) {
    malloc(dev_ptr, size);
    return;
  }
  malloc_async_impl(dev_ptr, size, stream);
}

// Frees the allocation at `dev_ptr`.
//
// If the AMDGPU context reports memory pool support, the request goes to
// `mem_free_async_impl` together with `stream`. Otherwise it falls back to
// `mem_free`, and `stream` is not used.
void AMDGPUDriver::mem_free_async(void *dev_ptr, void *stream) {
  const bool pool_available = AMDGPUContext::get_instance().supports_mem_pool();
  if (!pool_available) {
    mem_free(dev_ptr);
    return;
  }
  mem_free_async_impl(dev_ptr, stream);
}

} // namespace lang
} // namespace quadrants
10 changes: 10 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ constexpr uint32 HIP_STREAM_NON_BLOCKING = 0x1;
constexpr uint32 HIP_MEM_ATTACH_GLOBAL = 0x1;
constexpr uint32 HIP_MEM_ADVISE_SET_PREFERRED_LOCATION = 3;
constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 26;
constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39;
constexpr uint32 HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 63;
// sizeof(hipDeviceProperties_t) in ROCm 6.
// ROCm 5.7.1 is 792 and ROCm 6 is 1472, so to make both work we use whichever
Expand All @@ -35,6 +36,7 @@ constexpr uint32 HIP_JIT_MAX_REGISTERS = 0;
constexpr uint32 HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2;
constexpr uint32 HIP_SUCCESS = 0;
constexpr uint32 HIP_MEMORYTYPE_DEVICE = 1;
constexpr uint32 HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD = 4;

std::string get_amdgpu_error_message(uint32 err);

Expand Down Expand Up @@ -70,6 +72,10 @@ class AMDGPUFunction {
fmt::format(" while calling {} ({})", name_, symbol_name_);
}

// True when `function_` is non-null, so callers can probe for this entry
// before calling it.
bool is_available() const { return function_ != nullptr; }

uint32 call_with_warning(Args... args) {
auto err = call(args...);
QD_WARN_IF(err, "{}", get_error_message(err));
Expand Down Expand Up @@ -117,6 +123,10 @@ class AMDGPUDriver : protected AMDGPUDriverBase {

void (*runtime_get_version)(int *);

void malloc_async(void **ptr, size_t size, void *stream);

void mem_free_async(void *ptr, void *stream);

bool detected();

static AMDGPUDriver &get_instance();
Expand Down
11 changes: 11 additions & 0 deletions quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,25 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async,
std::size_t,
void *);
// Device memory allocation. Each entry is (driver member name, HIP symbol,
// argument types...).
PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t);
// Asynchronous allocation: (out pointer, size, stream).
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *);
PER_AMDGPU_FUNCTION(malloc_managed,
hipMallocManaged,
void **,
std::size_t,
uint32);
PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t);
PER_AMDGPU_FUNCTION(mem_free, hipFree, void *);
// Asynchronous counterpart of mem_free: (pointer, stream).
PER_AMDGPU_FUNCTION(mem_free_async_impl, hipFreeAsync, void *, void *);
PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *);
// Memory pool queries/configuration: (out pool handle, int) — the int is
// presumably the device ordinal; confirm against the HIP API.
PER_AMDGPU_FUNCTION(device_get_default_mem_pool,
hipDeviceGetDefaultMemPool,
void **,
int);
// (pool handle, attribute id, pointer to the attribute value).
PER_AMDGPU_FUNCTION(mem_pool_set_attribute,
hipMemPoolSetAttribute,
void *,
uint32,
void *);
PER_AMDGPU_FUNCTION(mem_get_attribute,
hipPointerGetAttribute,
void *,
Expand Down
Loading