diff --git a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp index 15a23a89fe..55cd8f2211 100644 --- a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp +++ b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp @@ -17,6 +17,7 @@ #include "quadrants/ir/analysis.h" #include "quadrants/ir/transforms.h" #include "quadrants/codegen/codegen_utils.h" +#include "quadrants/inc/constants.h" namespace quadrants { namespace lang { @@ -26,6 +27,8 @@ using namespace llvm; class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { public: using IRVisitor::visit; + size_t dynamic_shared_array_bytes{0}; + TaskCodeGenAMDGPU(int id, const CompileConfig &config, QuadrantsLLVMContext &tlctx, @@ -48,6 +51,37 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { // We'll just ignore it } + // Promote large shared arrays to dynamic shared memory (LDS) + void visit(AllocaStmt *stmt) override { + auto tensor_type = stmt->ret_type.ptr_removed()->cast<TensorType>(); + if (tensor_type && stmt->is_shared) { + size_t shared_array_bytes = + tensor_type->get_num_elements() * + data_type_size(tensor_type->get_element_type()); + if (shared_array_bytes > cuda_dynamic_shared_array_threshold_bytes) { + if (dynamic_shared_array_bytes > 0) { + QD_ERROR( + "Only one dynamic shared array is allowed in the " + "current version."); + } + tensor_type->set_shape(std::vector<int>({0})); + dynamic_shared_array_bytes += shared_array_bytes; + } + + auto type = tlctx->get_data_type(tensor_type); + auto base = new llvm::GlobalVariable( + *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr, + fmt::format("shared_array_t{}_s{}", task_codegen_id, stmt->id), + nullptr, llvm::GlobalVariable::NotThreadLocal, + 3 /*addrspace=LDS*/); + base->setAlignment(llvm::MaybeAlign(8)); + auto ptr_type = llvm::PointerType::get(type, 0); + llvm_val[stmt] = builder->CreatePointerCast(base, ptr_type); + } else { + TaskCodeGenLLVM::visit(stmt); + } + } + void emit_extra_unary(UnaryOpStmt *stmt) override { auto input = llvm_val[stmt->operand]; auto input_quadrants_type = stmt->operand->ret_type; @@ -79,7 +113,8 @@ } else { QD_NOT_IMPLEMENTED } - } // TODO simplify the impl of sgn + } + // Branchless sgn using select (no alloca/scratch) else if (op == UnaryOpType::sgn) { if (input_quadrants_type->is_primitive(PrimitiveTypeID::i32)) { auto ashr = builder->CreateAShr(input, 31); auto sub = builder->CreateSub(tlctx->get_constant(0), input); auto lshr = builder->CreateLShr(sub, 31); llvm_val[stmt] = builder->CreateOr(ashr, lshr); } else if (input_quadrants_type->is_primitive(PrimitiveTypeID::f32)) { - auto func = builder->GetInsertBlock()->getParent(); - auto bb_oeq_then = BasicBlock::Create(*llvm_context, "oeq_then", func); - auto bb_oeq_else = BasicBlock::Create(*llvm_context, "oeq_else"); - auto bb_merge = BasicBlock::Create(*llvm_context, "merge"); - auto bb_olt_then = BasicBlock::Create(*llvm_context, "olt_then", func); - auto bb_olt_else = BasicBlock::Create(*llvm_context, "olt_else"); - - auto alloc = builder->CreateAlloca( - llvm::Type::getFloatTy(*llvm_context), (unsigned)5); - auto newty = llvm::PointerType::get( - llvm::Type::getFloatTy(*llvm_context), (unsigned)0); - auto cast = builder->CreateAddrSpaceCast(alloc, newty); - auto fcmp_oeq = builder->CreateFCmpOEQ( - input, - llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), 0)); - builder->CreateCondBr(fcmp_oeq, bb_oeq_then, bb_oeq_else); - builder->SetInsertPoint(bb_oeq_then); - builder->CreateStore(
llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), 0), - cast); - builder->CreateBr(bb_merge); - bb_oeq_then = builder->GetInsertBlock(); - - func->insert(func->end(), bb_oeq_else); - builder->SetInsertPoint(bb_oeq_else); - auto fcmp_olt = builder->CreateFCmpOLT( - input, - llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), 0)); - builder->CreateCondBr(fcmp_olt, bb_olt_then, bb_olt_else); - bb_oeq_else = builder->GetInsertBlock(); - - builder->SetInsertPoint(bb_olt_then); - builder->CreateStore( - llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), -1), - cast); - builder->CreateBr(bb_merge); - bb_olt_then = builder->GetInsertBlock(); - - func->insert(func->end(), bb_olt_else); - builder->SetInsertPoint(bb_olt_else); - builder->CreateStore( - llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), 1), - cast); - builder->CreateBr(bb_merge); - bb_olt_else = builder->GetInsertBlock(); - - func->insert(func->end(), bb_merge); - builder->SetInsertPoint(bb_merge); - llvm_val[stmt] = - builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast); + auto *float_ty = llvm::Type::getFloatTy(*llvm_context); + auto *zero = llvm::ConstantFP::get(float_ty, 0.0); + auto *neg_one = llvm::ConstantFP::get(float_ty, -1.0); + auto *pos_one = llvm::ConstantFP::get(float_ty, 1.0); + auto *is_neg = builder->CreateFCmpOLT(input, zero); + auto *is_zero = builder->CreateFCmpOEQ(input, zero); + auto *neg_or_pos = builder->CreateSelect(is_neg, neg_one, pos_one); + llvm_val[stmt] = builder->CreateSelect(is_zero, zero, neg_or_pos); } else if (input_quadrants_type->is_primitive(PrimitiveTypeID::f64)) { - auto func = builder->GetInsertBlock()->getParent(); - auto bb_oeq_then = BasicBlock::Create(*llvm_context, "oeq_then", func); - auto bb_oeq_else = BasicBlock::Create(*llvm_context, "oeq_else"); - auto bb_merge = BasicBlock::Create(*llvm_context, "merge"); - auto bb_olt_then = BasicBlock::Create(*llvm_context, "olt_then", func); - auto bb_olt_else = BasicBlock::Create(*llvm_context, "olt_else"); - - auto alloc = builder->CreateAlloca( - llvm::Type::getDoubleTy(*llvm_context), (unsigned)5); - auto newty = llvm::PointerType::get( - llvm::Type::getDoubleTy(*llvm_context), (unsigned)0); - auto cast = builder->CreateAddrSpaceCast(alloc, newty); - auto fcmp_oeq = builder->CreateFCmpOEQ( - input, - llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0)); - builder->CreateCondBr(fcmp_oeq, bb_oeq_then, bb_oeq_else); - builder->SetInsertPoint(bb_oeq_then); - builder->CreateStore( - llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0), - cast); - builder->CreateBr(bb_merge); - bb_oeq_then = builder->GetInsertBlock(); - - func->insert(func->end(), bb_oeq_else); - builder->SetInsertPoint(bb_oeq_else); - auto fcmp_olt = builder->CreateFCmpOLT( - input, - llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0)); - builder->CreateCondBr(fcmp_olt, bb_olt_then, bb_olt_else); - bb_oeq_else = builder->GetInsertBlock(); - - builder->SetInsertPoint(bb_olt_then); - builder->CreateStore( - llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), -1), - cast); - builder->CreateBr(bb_merge); - bb_olt_then = builder->GetInsertBlock(); - - func->insert(func->end(), bb_olt_else); - builder->SetInsertPoint(bb_olt_else); - builder->CreateStore( - llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1), - cast); - builder->CreateBr(bb_merge); - bb_olt_else = builder->GetInsertBlock(); - - func->insert(func->end(), bb_merge); - builder->SetInsertPoint(bb_merge); - 
llvm_val[stmt] = - builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast); + auto *double_ty = llvm::Type::getDoubleTy(*llvm_context); + auto *zero = llvm::ConstantFP::get(double_ty, 0.0); + auto *neg_one = llvm::ConstantFP::get(double_ty, -1.0); + auto *pos_one = llvm::ConstantFP::get(double_ty, 1.0); + auto *is_neg = builder->CreateFCmpOLT(input, zero); + auto *is_zero = builder->CreateFCmpOEQ(input, zero); + auto *neg_or_pos = builder->CreateSelect(is_neg, neg_one, pos_one); + llvm_val[stmt] = builder->CreateSelect(is_zero, zero, neg_or_pos); } } UNARY_STD(cos) - UNARY_STD(acos) UNARY_STD(sin) + UNARY_STD(log) + UNARY_STD(acos) UNARY_STD(asin) UNARY_STD(tan) UNARY_STD(tanh) UNARY_STD(exp) - UNARY_STD(log) UNARY_STD(sqrt) else { QD_P(unary_op_type_name(op)); @@ -267,7 +218,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { auto [begin, end] = get_range_for_bounds(stmt); call("gpu_parallel_range_for", - {get_arg(0), begin, end, tls_prologue, body, epilogue, + {get_context(), begin, end, tls_prologue, body, epilogue, tlctx->get_constant(stmt->tls_size)}); } @@ -307,42 +258,147 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { } bool kernel_argument_by_val() const override { - // on AMDGPU, pass the argument by value is not allowed return false; } + bool kernel_argument_struct_in_kernarg() const override { + return true; + } + + // SNode root pointers are hipMalloc'd global memory. Cast result + // to addrspace(1) so GEP chains produce global_load after inlining. + void visit(GetRootStmt *stmt) override { + TaskCodeGenLLVM::visit(stmt); + auto *ptr_as1 = llvm::PointerType::get(*llvm_context, 1); + llvm_val[stmt] = builder->CreateAddrSpaceCast(llvm_val[stmt], ptr_as1); + } + + void visit(SNodeLookupStmt *stmt) override { + // Cast addrspace(1) input to addrspace(0) for base visitor's + // runtime function calls, then cast result back to addrspace(1). 
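+ // Illustrative IR (hypothetical SSA names, not emitted verbatim): + // %flat = addrspacecast ptr addrspace(1) %node to ptr + // %elem = call ptr @runtime_lookup(...) ; flat runtime helper + // %res = addrspacecast ptr %elem to ptr addrspace(1)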
+ auto *input = llvm_val[stmt->input_snode]; + if (input && input->getType()->isPointerTy() && + input->getType()->getPointerAddressSpace() == 1) { + auto *ptr_as0 = llvm::PointerType::getUnqual(*llvm_context); + llvm_val[stmt->input_snode] = + builder->CreateAddrSpaceCast(input, ptr_as0); + } + TaskCodeGenLLVM::visit(stmt); + llvm_val[stmt->input_snode] = input; + if (llvm_val[stmt] && llvm_val[stmt]->getType()->isPointerTy() && + llvm_val[stmt]->getType()->getPointerAddressSpace() == 0) { + auto *ptr_as1 = llvm::PointerType::get(*llvm_context, 1); + llvm_val[stmt] = builder->CreateAddrSpaceCast(llvm_val[stmt], ptr_as1); + } + } + + void visit(GetChStmt *stmt) override { + if (stmt->input_snode->type == SNodeType::quant_array || + stmt->ret_type->as<PointerType>()->is_bit_pointer()) { + TaskCodeGenLLVM::visit(stmt); + return; + } + auto *input = llvm_val[stmt->input_ptr]; + if (input && input->getType()->isPointerTy() && + input->getType()->getPointerAddressSpace() == 1) { + auto *ptr_as0 = llvm::PointerType::getUnqual(*llvm_context); + llvm_val[stmt->input_ptr] = + builder->CreateAddrSpaceCast(input, ptr_as0); + } + TaskCodeGenLLVM::visit(stmt); + llvm_val[stmt->input_ptr] = input; + if (llvm_val[stmt] && llvm_val[stmt]->getType()->isPointerTy() && + llvm_val[stmt]->getType()->getPointerAddressSpace() == 0) { + auto *ptr_as1 = llvm::PointerType::get(*llvm_context, 1); + llvm_val[stmt] = builder->CreateAddrSpaceCast(llvm_val[stmt], ptr_as1); + } + } + + llvm::Value *get_runtime() override { + auto *runtime_context_ty = get_runtime_type("RuntimeContext"); + auto *runtime_ptr_addr = builder->CreateStructGEP( + runtime_context_ty, TaskCodeGenLLVM::get_context(), 1); + auto *runtime_ty = + llvm::PointerType::get(get_runtime_type("LLVMRuntime"), 0); + auto *runtime_ptr = builder->CreateLoad(runtime_ty, runtime_ptr_addr); + auto *invariant_load_metadata = + llvm::MDNode::get(builder->getContext(), {}); + runtime_ptr->setMetadata(llvm::LLVMContext::MD_invariant_load, + invariant_load_metadata); + return runtime_ptr; + } + + // Read-only cache loads via invariant.load metadata + llvm::Value *create_intrinsic_load(llvm::Value *ptr, + llvm::Type *ty) override { + auto *ptr_ty_addrspace_1 = llvm::PointerType::get(ty, 1); + auto *cast_ptr = builder->CreateAddrSpaceCast(ptr, ptr_ty_addrspace_1); + auto *load = builder->CreateLoad(ty, cast_ptr); + auto *invariant_load_metadata = + llvm::MDNode::get(builder->getContext(), {}); + load->setMetadata(llvm::LLVMContext::MD_invariant_load, + invariant_load_metadata); + return load; + } + void visit(GlobalLoadStmt *stmt) override { auto ptr = llvm_val[stmt->src]; auto ptr_type = stmt->src->ret_type->as<PointerType>(); if (ptr_type->is_bit_pointer()) { - auto val_type = ptr_type->get_pointee_type(); - auto get_ch = stmt->src->as<GetChStmt>(); - auto physical_type = - tlctx->get_data_type(get_ch->input_snode->physical_type); - auto [byte_ptr, bit_offset] = load_bit_ptr(ptr); - auto physical_value = builder->CreateLoad(physical_type, byte_ptr); - if (auto qit = val_type->cast<QuantIntType>()) { - llvm_val[stmt] = extract_quant_int(physical_value, bit_offset, qit); - } else if (auto qfxt = val_type->cast<QuantFixedType>()) { - qit = qfxt->get_digits_type()->as<QuantIntType>(); - auto digits = extract_quant_int(physical_value, bit_offset, qit); - llvm_val[stmt] = reconstruct_quant_fixed(digits, qfxt); + if (auto get_ch = stmt->src->cast<GetChStmt>()) { + bool should_cache_as_read_only = + current_offload->mem_access_opt.has_flag( + get_ch->output_snode, SNodeAccessFlag::read_only); + create_global_load(stmt, should_cache_as_read_only); } else { - QD_ASSERT(val_type->is<QuantFloatType>()); - QD_ASSERT(get_ch->input_snode->dt->is<BitStructType>()); - llvm_val[stmt] = extract_quant_float( - physical_value, get_ch->input_snode->dt->as<BitStructType>(), - get_ch->output_snode->id_in_bit_struct); + create_global_load(stmt, false); + } + } else { + // SNode data lives in hipMalloc'd global memory. Cast pointer to + // addrspace(1) so LLVM emits global_load instead of flat_load, + // avoiding the FLAT unit's runtime address-space resolution. + auto *load_ty = tlctx->get_data_type(stmt->ret_type); + bool read_only = false; + if (auto get_ch = stmt->src->cast<GetChStmt>()) { + read_only = current_offload->mem_access_opt.has_flag( + get_ch->output_snode, SNodeAccessFlag::read_only); + } + auto *ptr_as1 = llvm::PointerType::get(load_ty, 1); + auto *cast_ptr = builder->CreateAddrSpaceCast(ptr, ptr_as1); + auto *load = builder->CreateLoad(load_ty, cast_ptr); + if (read_only) { + auto *md = llvm::MDNode::get(builder->getContext(), {}); + load->setMetadata(llvm::LLVMContext::MD_invariant_load, md); + } + llvm_val[stmt] = load; + } + } + + void visit(GlobalStoreStmt *stmt) override { + QD_ASSERT(llvm_val[stmt->val]); + QD_ASSERT(llvm_val[stmt->dest]); + auto ptr_type = stmt->dest->ret_type->as<PointerType>(); + if (ptr_type->is_bit_pointer()) { + TaskCodeGenLLVM::visit(stmt); } else { - // Byte pointer case. - llvm_val[stmt] = - builder->CreateLoad(tlctx->get_data_type(stmt->ret_type), ptr); + // Cast to addrspace(1) for global_store instead of flat_store. + auto *val_ty = llvm_val[stmt->val]->getType(); + auto *ptr_as1 = llvm::PointerType::get(val_ty, 1); + auto *cast_ptr = + builder->CreateAddrSpaceCast(llvm_val[stmt->dest], ptr_as1); + builder->CreateStore(llvm_val[stmt->val], cast_ptr); } } + // BLS / shared memory buffer allocation void create_bls_buffer(OffloadedStmt *stmt) { - QD_NOT_IMPLEMENTED + auto type = llvm::ArrayType::get( + llvm::Type::getInt8Ty(*llvm_context), stmt->bls_size); + bls_buffer = new llvm::GlobalVariable( + *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr, + "bls_buffer", nullptr, llvm::GlobalVariable::NotThreadLocal, + 3 /*addrspace=LDS*/); + bls_buffer->setAlignment(llvm::MaybeAlign(8)); } void visit(OffloadedStmt *stmt) override { @@ -384,22 +440,26 @@ } } if (stmt->task_type == Type::listgen) { - // Note: 32 is a temporary number - // TODO: find a func to obtain this attr - int query_max_block_per_sm = 32; - // AMDGPUDriver::get_instance().device_get_attribute( - // &query_max_block_per_sm, - // HIP_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR, nullptr); int num_SMs; AMDGPUDriver::get_instance().device_get_attribute( &num_SMs, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); + int max_threads_per_sm = 0; + AMDGPUDriver::get_instance().device_get_attribute( + &max_threads_per_sm, + HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0); + int query_max_block_per_sm = + (max_threads_per_sm > 0 && stmt->block_dim > 0) + ? 
(max_threads_per_sm / stmt->block_dim) : 32; current_task->grid_dim = num_SMs * query_max_block_per_sm; } current_task->block_dim = stmt->block_dim; + current_task->dynamic_shared_array_bytes = dynamic_shared_array_bytes; QD_ASSERT(current_task->grid_dim != 0); QD_ASSERT(current_task->block_dim != 0); offloaded_tasks.push_back(*current_task); current_task = nullptr; + dynamic_shared_array_bytes = 0; } current_offload = nullptr; #else diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp index 700acbd596..1d8dccd812 100644 --- a/quadrants/codegen/llvm/codegen_llvm.cpp +++ b/quadrants/codegen/llvm/codegen_llvm.cpp @@ -38,6 +38,8 @@ FunctionCreationGuard::FunctionCreationGuard( llvm::Function::InternalLinkage, func_name, mb->module.get()); old_func = mb->func; + old_context_val_alloca = mb->context_val_alloca_; + mb->context_val_alloca_ = nullptr; // emit into loop body function mb->func = body; @@ -74,6 +76,7 @@ FunctionCreationGuard::~FunctionCreationGuard() { mb->entry_block = old_entry; mb->final_block = old_final; mb->func = old_func; + mb->context_val_alloca_ = old_context_val_alloca; mb->builder->restoreIP(ip); QD_ASSERT(!llvm::verifyFunction(*body, &llvm::errs())); @@ -2015,9 +2018,15 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt, current_loop_reentry = nullptr; current_while_after_loop = nullptr; + llvm::Type *context_param_type; + if (kernel_argument_struct_in_kernarg()) { + context_param_type = context_ty; + } else { + context_param_type = llvm::PointerType::get(context_ty, 0); + } task_function_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*llvm_context), - {llvm::PointerType::get(context_ty, 0)}, false); + {context_param_type}, false); auto task_kernel_name = fmt::format( "{}_{}_{}{}", kernel_name, task_codegen_id, stmt->task_name(), suffix); @@ -2041,6 +2050,13 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt, // The real function body func_body_bb = llvm::BasicBlock::Create(*llvm_context, "body", func); builder->SetInsertPoint(func_body_bb); + + if (kernel_argument_struct_in_kernarg()) { + context_val_alloca_ = create_entry_block_alloca(context_ty); + builder->CreateStore(kernel_args[0], context_val_alloca_); + } else { + context_val_alloca_ = nullptr; + } return task_kernel_name; } @@ -2627,6 +2643,8 @@ llvm::Value *TaskCodeGenLLVM::get_arg(int i) { } llvm::Value *TaskCodeGenLLVM::get_context() { + if (context_val_alloca_) + return context_val_alloca_; return get_arg(0); } @@ -2732,7 +2750,7 @@ LLVMCompiledTask TaskCodeGenLLVM::run_compilation() { for (const auto &task : offloaded_tasks) { llvm::Function *func = module->getFunction(task.name); QD_ASSERT(func); - tlctx->mark_function_as_amdgpu_kernel(func); + tlctx->mark_function_as_amdgpu_kernel(func, task.block_dim); } #if defined(QD_WITH_AMDGPU) llvm::legacy::FunctionPassManager fpm(module.get()); diff --git a/quadrants/codegen/llvm/codegen_llvm.h b/quadrants/codegen/llvm/codegen_llvm.h index 275dcd65c0..5f072e6b59 100644 --- a/quadrants/codegen/llvm/codegen_llvm.h +++ b/quadrants/codegen/llvm/codegen_llvm.h @@ -22,6 +22,7 @@ class FunctionCreationGuard { llvm::Function *body; llvm::BasicBlock *old_entry, *allocas, *entry, *old_final, *final; llvm::IRBuilder<>::InsertPoint ip; + llvm::Value *old_context_val_alloca{nullptr}; FunctionCreationGuard(TaskCodeGenLLVM *mb, std::vector<llvm::Type *> arguments, @@ -97,6 +98,8 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { llvm::Value *get_tls_base_ptr(); + 
llvm::Value *context_val_alloca_{nullptr}; + llvm::Type *get_tls_buffer_type(); std::vector<llvm::Type *> get_xlogue_argument_types(); @@ -111,7 +114,7 @@ llvm::Value *get_root(int snode_tree_id); - llvm::Value *get_runtime(); + virtual llvm::Value *get_runtime(); void emit_struct_meta_base(const std::string &name, llvm::Value *node_meta, @@ -330,6 +333,13 @@ return false; // on CPU devices just pass in a pointer } + // On AMDGPU, byval attribute is disallowed by the calling convention + // verifier. Instead, pass the struct type directly in kernarg and store + // to an alloca at function entry to obtain a pointer. + virtual bool kernel_argument_struct_in_kernarg() const { + return false; + } + std::string init_offloaded_task_function(OffloadedStmt *stmt, std::string suffix = ""); diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h index ea49cebb75..0b1e55b9a1 100644 --- a/quadrants/program/compile_config.h +++ b/quadrants/program/compile_config.h @@ -102,6 +102,8 @@ struct CompileConfig { size_t cuda_stack_limit{0}; + bool amdgpu_auto_waves_per_eu{true}; + CompileConfig(); void fit(); diff --git a/quadrants/program/extension.cpp b/quadrants/program/extension.cpp index 03374fddc9..4da7b3ab63 100644 --- a/quadrants/program/extension.cpp +++ b/quadrants/program/extension.cpp @@ -19,7 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) { {Extension::sparse, Extension::quant, Extension::quant_basic, Extension::data64, Extension::adstack, Extension::bls, Extension::assertion, Extension::mesh}}, - {Arch::amdgpu, {Extension::assertion}}, + {Arch::amdgpu, {Extension::assertion, Extension::bls}}, {Arch::metal, {}}, {Arch::vulkan, {}}, }; diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index abbba729a4..9d1f21f409 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -201,6 +201,8 @@ void export_lang(py::module &m) { .def_readwrite("default_gpu_block_dim", &CompileConfig::default_gpu_block_dim) .def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg) + .def_readwrite("amdgpu_auto_waves_per_eu", + &CompileConfig::amdgpu_auto_waves_per_eu) .def_readwrite("saturating_grid_dim", &CompileConfig::saturating_grid_dim) .def_readwrite("max_block_dim", &CompileConfig::max_block_dim) .def_readwrite("cpu_max_num_threads", &CompileConfig::cpu_max_num_threads) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 22f55339ee..4880cc1fb0 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -84,6 +84,21 @@ AMDGPUContext::AMDGPUContext() mcpu_ = mcpu_.substr(0, mcpu_.find(":")); std::free(hip_device_prop); + if (driver_.device_get_default_mem_pool.is_available()) { + void *default_mem_pool = nullptr; + uint32 err = driver_.device_get_default_mem_pool.call_with_warning( + &default_mem_pool, 0); + if (err == HIP_SUCCESS && default_mem_pool != nullptr) { + supports_mem_pool_ = true; + constexpr uint64 kMemPoolReleaseThreshold = 1048576 * 128; + driver_.mem_pool_set_attribute(default_mem_pool, + HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD, + (void *)&kMemPoolReleaseThreshold); + QD_TRACE("HIP memory pool enabled (release threshold: {} bytes)", + kMemPoolReleaseThreshold); + } + } + QD_TRACE("Emitting AMDGPU code for {}", mcpu_); } @@ -175,7 +190,8 @@ void AMDGPUContext::launch(void *func, bool valid = 
offline_cache::try_demangle_name(task_name, primal_task_name, key); profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name, - func, grid_dim, block_dim, 0); + func, grid_dim, block_dim, + dynamic_shared_mem_bytes); } auto context_guard = AMDGPUContext::get_instance().get_guard(); diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 9529953bf1..e2d6ded142 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,6 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; + bool supports_mem_pool_{false}; std::vector<void *> kernel_arg_pointer_; public: @@ -36,13 +37,21 @@ return dev_count_ != 0; } + bool supports_mem_pool() const { + return supports_mem_pool_; + } + void push_back_kernel_arg_pointer(void *ptr) { kernel_arg_pointer_.push_back(ptr); } void free_kernel_arg_pointer() { for (auto &i : kernel_arg_pointer_) { - AMDGPUDriver::get_instance().mem_free(i); + if (supports_mem_pool_) { + AMDGPUDriver::get_instance().mem_free_async(i, nullptr); + } else { + AMDGPUDriver::get_instance().mem_free(i); + } } kernel_arg_pointer_.erase(kernel_arg_pointer_.begin(), kernel_arg_pointer_.end()); diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index 146ecf96c4..4bc7d8fbc9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -56,19 +56,26 @@ DeviceAllocation AmdgpuDevice::allocate_memory_runtime( const LlvmRuntimeAllocParams &params) { AllocInfo info; info.size = quadrants::iroundup(params.size, quadrants_page_size); - if (params.host_read || params.host_write) { + if (info.size == 0) { + info.ptr = nullptr; + } else if (params.use_memory_pool) { + AMDGPUDriver::get_instance().malloc_async((void **)&info.ptr, info.size, + nullptr); + } else if (params.host_read || params.host_write) { QD_NOT_IMPLEMENTED } else { info.ptr = DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/) .allocate_with_cache(this, params); - QD_ASSERT(info.ptr != nullptr); + } + if (info.ptr) AMDGPUDriver::get_instance().memset((void *)info.ptr, 0, info.size); - } + info.is_imported = false; info.use_cached = true; info.use_preallocated = true; + info.use_memory_pool = params.use_memory_pool; DeviceAllocation alloc; alloc.alloc_id = allocations_.size(); @@ -98,11 +105,17 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { validate_device_alloc(handle); AllocInfo &info = allocations_[handle.alloc_id]; + + if (info.size == 0) { + return; + } if (info.ptr == nullptr) { QD_ERROR("the DeviceAllocation is already deallocated"); } QD_ASSERT(!info.is_imported); - if (info.use_cached) { + if (info.use_memory_pool) { + AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr); + } else if (info.use_cached) { DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/) .release(info.size, (uint64_t *)info.ptr, false); } else if (!info.use_preallocated) { diff --git a/quadrants/rhi/amdgpu/amdgpu_device.h b/quadrants/rhi/amdgpu/amdgpu_device.h index bdb15294d3..d00613b3ba 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.h +++ b/quadrants/rhi/amdgpu/amdgpu_device.h @@ -75,6 +75,7 @@ class AmdgpuDevice : public LlvmDevice { bool is_imported{false}; bool use_preallocated{true}; bool use_cached{false}; + bool use_memory_pool{false}; void *mapped{nullptr}; }; diff --git a/quadrants/rhi/amdgpu/amdgpu_driver.cpp 
b/quadrants/rhi/amdgpu/amdgpu_driver.cpp index 81c2da7351..5210d6b2a2 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_driver.cpp @@ -79,5 +79,21 @@ AMDGPUDriver &AMDGPUDriver::get_instance() { return get_instance_without_context(); } +void AMDGPUDriver::malloc_async(void **dev_ptr, size_t size, void *stream) { + if (AMDGPUContext::get_instance().supports_mem_pool()) { + malloc_async_impl(dev_ptr, size, stream); + } else { + malloc(dev_ptr, size); + } +} + +void AMDGPUDriver::mem_free_async(void *dev_ptr, void *stream) { + if (AMDGPUContext::get_instance().supports_mem_pool()) { + mem_free_async_impl(dev_ptr, stream); + } else { + mem_free(dev_ptr); + } +} + } // namespace lang } // namespace quadrants diff --git a/quadrants/rhi/amdgpu/amdgpu_driver.h b/quadrants/rhi/amdgpu/amdgpu_driver.h index bb2d83ff24..94f78a9b86 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver.h @@ -13,6 +13,7 @@ constexpr uint32 HIP_STREAM_NON_BLOCKING = 0x1; constexpr uint32 HIP_MEM_ATTACH_GLOBAL = 0x1; constexpr uint32 HIP_MEM_ADVISE_SET_PREFERRED_LOCATION = 3; constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 26; +constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39; constexpr uint32 HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 63; // sizeof(hipDeviceProperties_t) in ROCm 6. // ROCm 5.7.1 is 792 and ROCm 6 is 1472, so to make both work we use whichever @@ -35,6 +36,7 @@ constexpr uint32 HIP_JIT_MAX_REGISTERS = 0; constexpr uint32 HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2; constexpr uint32 HIP_SUCCESS = 0; constexpr uint32 HIP_MEMORYTYPE_DEVICE = 1; +constexpr uint32 HIP_MEMPOOL_ATTR_RELEASE_THRESHOLD = 4; std::string get_amdgpu_error_message(uint32 err); @@ -70,6 +72,10 @@ class AMDGPUFunction { fmt::format(" while calling {} ({})", name_, symbol_name_); } + bool is_available() const { + return function_ != nullptr; + } + uint32 call_with_warning(Args... 
args) { auto err = call(args...); QD_WARN_IF(err, "{}", get_error_message(err)); @@ -117,6 +123,10 @@ class AMDGPUDriver : protected AMDGPUDriverBase { void (*runtime_get_version)(int *); + void malloc_async(void **ptr, size_t size, void *stream); + + void mem_free_async(void *ptr, void *stream); + bool detected(); static AMDGPUDriver &get_instance(); diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index dbb3612c87..d4cabe73a2 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -69,6 +69,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, @@ -76,7 +77,17 @@ PER_AMDGPU_FUNCTION(malloc_managed, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); PER_AMDGPU_FUNCTION(mem_free, hipFree, void *); +PER_AMDGPU_FUNCTION(mem_free_async_impl, hipFreeAsync, void *, void *); PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *); +PER_AMDGPU_FUNCTION(device_get_default_mem_pool, + hipDeviceGetDefaultMemPool, + void **, + int); +PER_AMDGPU_FUNCTION(mem_pool_set_attribute, + hipMemPoolSetAttribute, + void *, + uint32, + void *); PER_AMDGPU_FUNCTION(mem_get_attribute, hipPointerGetAttribute, void *, diff --git a/quadrants/runtime/amdgpu/jit_amdgpu.cpp b/quadrants/runtime/amdgpu/jit_amdgpu.cpp index 11bf5c6a4c..a03336d180 100644 --- a/quadrants/runtime/amdgpu/jit_amdgpu.cpp +++ b/quadrants/runtime/amdgpu/jit_amdgpu.cpp @@ -1,9 +1,14 @@ #include "quadrants/runtime/amdgpu/jit_amdgpu.h" #include "quadrants/runtime/llvm/llvm_context.h" #include "quadrants/runtime/llvm/llvm_context_pass.h" +#include "quadrants/rhi/amdgpu/amdgpu_types.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" +#include "llvm/Transforms/Utils.h" #include #include @@ -13,6 +18,18 @@ namespace lang { #if defined(QD_WITH_AMDGPU) JITModule *JITSessionAMDGPU ::add_module(std::unique_ptr<llvm::Module> M, int max_reg) { + // HSACo caching + auto cache_key = compute_module_cache_key(M.get()); + auto cache_it = hsaco_cache_.find(cache_key); + if (cache_it != hsaco_cache_.end()) { + QD_TRACE("HSACo cache hit for key {}", cache_key.substr(0, 16)); + void *amdgpu_module; + AMDGPUDriver::get_instance().module_load_data(&amdgpu_module, + cache_it->second.c_str()); + modules.push_back(std::make_unique<JITModuleAMDGPU>(amdgpu_module)); + return modules.back().get(); + } + auto hsaco = compile_module_to_hsaco(M); QD_TRACE("hsaco size: {:.2f}KB", hsaco.size() / 1024.0); @@ -21,6 +38,9 @@ JITModule *JITSessionAMDGPU ::add_module(std::unique_ptr<llvm::Module> M, AMDGPUDriver::get_instance().module_load_data(&amdgpu_module, hsaco.c_str()); QD_TRACE("AMDGPU load data from module time : {}ms", (Time::get_time() - t) * 1000); + + hsaco_cache_[cache_key] = hsaco; + modules.push_back(std::make_unique<JITModuleAMDGPU>(amdgpu_module)); return modules.back().get(); } @@ -36,6 +56,45 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco( function_pass_manager_addrcast.run(*func); function_pass_manager_addrcast.doFinalization(); + for (auto &F : *llvm_module) { + // For parity with CUDA: jit_cuda.cpp:332-335 unconditionally 
applies + // unsafe-fp-math to ALL functions via hardcoded kFTZDenorms=1. + // Enables FMA contraction, reciprocal for division, and operation + // reordering. Applied to all functions (not just kernels) because + // internal body functions contain the actual FP compute. + F.addFnAttr("unsafe-fp-math", "true"); + F.addFnAttr("no-signed-zeros-fp-math", "true"); + + if (F.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) { + const std::string kernel_name = F.getName().str(); + const bool is_lightweight_cg_subkernel = + kernel_name.find("_kernel_cg_only_save_prev_grad") != + std::string::npos || + kernel_name.find("_kernel_update_constraint_forces") != + std::string::npos || + kernel_name.find("_kernel_update_constraint_qfrc") != + std::string::npos || + kernel_name.find("_kernel_update_constraint_cost") != + std::string::npos || + kernel_name.find("_kernel_update_search_direction") != + std::string::npos; + + if (!is_lightweight_cg_subkernel) { + F.addFnAttr("amdgpu-waves-per-eu", "1,2"); + } + F.addFnAttr("uniform-work-group-size", "true"); + F.addFnAttr("amdgpu-ieee", "false"); + F.addFnAttr("amdgpu-dx10-clamp", "false"); + } + } + + auto *daz_type = llvm::Type::getInt8Ty(llvm_module->getContext()); + auto *daz_init = llvm::ConstantInt::get(daz_type, 1); + auto *daz_var = new llvm::GlobalVariable( + *llvm_module, daz_type, true, llvm::GlobalValue::LinkOnceODRLinkage, + daz_init, "__oclc_daz_opt"); + daz_var->setVisibility(llvm::GlobalValue::HiddenVisibility); + if (llvm::verifyModule(*llvm_module, &llvm::errs())) { llvm_module->print(llvm::errs(), nullptr); QD_WARN("Module broken"); @@ -54,15 +113,17 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco( llvm::TargetOptions options; options.MCOptions.AsmVerbose = false; + // FMA contraction always enabled to match CUDA's unconditional + // unsafe-fp-math=true (function attribute overrides TargetOptions, + // but setting Fast here ensures consistent behavior across all + // AMDGPU backend passes that check TargetOptions directly). 
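+ // e.g. under Fast fusion, `a * b + c` may lower to a single v_fma_f32 + // (one rounding step) instead of v_mul_f32 followed by v_add_f32; the + // exact instruction choice depends on the target and data type.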
+ options.AllowFPOpFusion = FPOpFusion::Fast; if (this->config_.fast_math) { - options.AllowFPOpFusion = FPOpFusion::Fast; - // UnsafeFPMath was removed in LLVM 22; set the individual flags it implied options.NoInfsFPMath = 1; options.NoNaNsFPMath = 1; options.NoSignedZerosFPMath = 1; options.NoTrappingFPMath = 1; } else { - options.AllowFPOpFusion = FPOpFusion::Strict; options.NoInfsFPMath = 0; options.NoNaNsFPMath = 0; options.NoSignedZerosFPMath = 0; @@ -156,6 +217,16 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco( // Run the new optimization pipeline mpm.run(*llvm_module, mam); + // Additional LLVM optimization passes + llvm::legacy::FunctionPassManager extra_fpm(llvm_module.get()); + extra_fpm.add(llvm::createLoopStrengthReducePass()); + extra_fpm.add(llvm::createSeparateConstOffsetFromGEPPass(false)); + extra_fpm.add(llvm::createEarlyCSEPass(true)); + extra_fpm.doInitialization(); + for (auto func = llvm_module->begin(); func != llvm_module->end(); ++func) + extra_fpm.run(*func); + extra_fpm.doFinalization(); + // Keep legacy PassManager for backend code generation module_pass_manager.add(llvm::createTargetTransformInfoWrapperPass( machine->getTargetIRAnalysis())); diff --git a/quadrants/runtime/amdgpu/jit_amdgpu.h b/quadrants/runtime/amdgpu/jit_amdgpu.h index eb2a5a67ae..136a9cf68a 100644 --- a/quadrants/runtime/amdgpu/jit_amdgpu.h +++ b/quadrants/runtime/amdgpu/jit_amdgpu.h @@ -2,11 +2,13 @@ #include #include #include +#include <unordered_map> #include #include "llvm/ADT/StringRef.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/MD5.h" #include "llvm/Target/TargetMachine.h" #include "llvm/IR/Module.h" #include "llvm/IR/DataLayout.h" @@ -23,6 +25,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" @@ -137,10 +140,22 @@ return tmp_dir_; } + std::string compute_module_cache_key(llvm::Module *module) { + std::string bitcode; + llvm::raw_string_ostream sos(bitcode); + llvm::WriteBitcodeToFile(*module, sos); + llvm::MD5 hasher; + hasher.update(bitcode); + llvm::MD5::MD5Result result; + hasher.final(result); + return result.digest().str().str(); + } + private: std::string compile_module_to_hsaco(std::unique_ptr<llvm::Module> &module); uint64_t random_num_; std::string tmp_dir_; + std::unordered_map<std::string, std::string> hsaco_cache_; }; #endif diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index eac751e37a..fcd6b1334b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -6,29 +6,26 @@ namespace quadrants::lang { namespace amdgpu { void KernelLauncher::launch_offloaded_tasks( + LaunchContextBuilder &ctx, JITModule *amdgpu_module, - const std::vector<OffloadedTask> &offloaded_tasks, - void *context_pointer, - int arg_size) { + const std::vector<OffloadedTask> &offloaded_tasks) { + constexpr int kRuntimeContextArgSize = sizeof(RuntimeContext); for (const auto &task : offloaded_tasks) { QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, task.block_dim); amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, task.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + {&ctx.get_context()}, {kRuntimeContextArgSize}); } } void KernelLauncher::launch_offloaded_tasks_with_do_while( LaunchContextBuilder &ctx, JITModule *amdgpu_module, - const std::vector<OffloadedTask> &offloaded_tasks, - void *context_pointer, - int arg_size) { + const std::vector<OffloadedTask> &offloaded_tasks) { int32_t counter_val; do { - launch_offloaded_tasks(amdgpu_module, offloaded_tasks, context_pointer, - arg_size); + launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks); counter_val = 0; AMDGPUDriver::get_instance().stream_synchronize(nullptr); AMDGPUDriver::get_instance().memcpy_device_to_host( @@ -64,17 +61,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, std::unordered_map device_ptrs; char *device_result_buffer{nullptr}; - // Here we have to guarantee the result_result_buffer isn't nullptr - // It is interesting - The code following - // L60: DeviceAllocation devalloc = - // executor->allocate_memory_on_device( call another kernel and it will result - // in - // Memory access fault by GPU node-1 (Agent handle: 0xeda5ca0) on address - // (nil). Reason: Page not present or supervisor privilege. - // if you don't allocate it. - AMDGPUDriver::get_instance().malloc( + AMDGPUDriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64))); + std::max(ctx.result_buffer_size, sizeof(uint64)), nullptr); for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; @@ -132,32 +121,22 @@ } char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().malloc((void **)&device_arg_buffer, - ctx.arg_buffer_size); + AMDGPUDriver::get_instance().malloc_async((void **)&device_arg_buffer, + ctx.arg_buffer_size, nullptr); AMDGPUDriver::get_instance().memcpy_host_to_device( device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size); ctx.get_context().arg_buffer = device_arg_buffer; } - void *context_pointer; - int arg_size = sizeof(RuntimeContext *); - AMDGPUDriver::get_instance().malloc((void **)&context_pointer, - sizeof(RuntimeContext)); - AMDGPUDriver::get_instance().memcpy_host_to_device( - context_pointer, &ctx.get_context(), sizeof(RuntimeContext)); - - AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); if (ctx.graph_do_while_arg_id >= 0) { QD_ASSERT(ctx.graph_do_while_flag_dev_ptr); - launch_offloaded_tasks_with_do_while(ctx, amdgpu_module, offloaded_tasks, - context_pointer, arg_size); + launch_offloaded_tasks_with_do_while(ctx, amdgpu_module, offloaded_tasks); } else { - launch_offloaded_tasks(amdgpu_module, offloaded_tasks, context_pointer, - arg_size); + launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks); } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free(device_arg_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, nullptr); } if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().memcpy_device_to_host( @@ -173,8 +152,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - // Since we always allocating above then we should always free - AMDGPUDriver::get_instance().mem_free(device_result_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, nullptr); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/quadrants/runtime/amdgpu/kernel_launcher.h b/quadrants/runtime/amdgpu/kernel_launcher.h index be4ca6c255..23051c3ff8 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.h +++ 
b/quadrants/runtime/amdgpu/kernel_launcher.h @@ -23,16 +23,14 @@ class KernelLauncher : public LLVM::KernelLauncher { const LLVM::CompiledKernelData &compiled) override; private: - void launch_offloaded_tasks(JITModule *amdgpu_module, - const std::vector<OffloadedTask> &offloaded_tasks, - void *context_pointer, - int arg_size); + void launch_offloaded_tasks( + LaunchContextBuilder &ctx, + JITModule *amdgpu_module, + const std::vector<OffloadedTask> &offloaded_tasks); void launch_offloaded_tasks_with_do_while( LaunchContextBuilder &ctx, JITModule *amdgpu_module, - const std::vector<OffloadedTask> &offloaded_tasks, - void *context_pointer, - int arg_size); + const std::vector<OffloadedTask> &offloaded_tasks); bool on_amdgpu_device(void *ptr); std::vector contexts_; }; diff --git a/quadrants/runtime/llvm/llvm_context.cpp b/quadrants/runtime/llvm/llvm_context.cpp index f2660d3e83..e594286f6e 100644 --- a/quadrants/runtime/llvm/llvm_context.cpp +++ b/quadrants/runtime/llvm/llvm_context.cpp @@ -918,8 +918,13 @@ void QuadrantsLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func, } void QuadrantsLLVMContext::mark_function_as_amdgpu_kernel( - llvm::Function *func) { + llvm::Function *func, int block_dim) { func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + if (block_dim > 0) { + std::string size_str = std::to_string(block_dim) + "," + + std::to_string(block_dim); + func->addFnAttr("amdgpu-flat-work-group-size", size_str); + } } void QuadrantsLLVMContext::eliminate_unused_functions( diff --git a/quadrants/runtime/llvm/llvm_context.h b/quadrants/runtime/llvm/llvm_context.h index 5ddf603675..5544d1bab0 100644 --- a/quadrants/runtime/llvm/llvm_context.h +++ b/quadrants/runtime/llvm/llvm_context.h @@ -109,7 +109,7 @@ class QuadrantsLLVMContext { void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0); - void mark_function_as_amdgpu_kernel(llvm::Function *func); + void mark_function_as_amdgpu_kernel(llvm::Function *func, int block_dim = 0); void fetch_this_thread_struct_module(); llvm::Module *get_this_thread_runtime_module(); diff --git a/quadrants/runtime/llvm/llvm_context_pass.h b/quadrants/runtime/llvm/llvm_context_pass.h index e386b07a78..8699b85f23 100644 --- a/quadrants/runtime/llvm/llvm_context_pass.h +++ b/quadrants/runtime/llvm/llvm_context_pass.h @@ -113,6 +113,71 @@ } }; +// After O3, remaining addrspace(0) loads/stores are either runtime +// metadata accesses (global memory) or alloca-derived accesses that +// InferAddressSpaces couldn't resolve. This pass converts the former +// to addrspace(1) by using stripPointerCasts() to trace pointer +// origins: if the origin is an alloca or addrspacecast-from-5 +// (scratch), the load/store is left as flat.
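+// Schematic before/after (hypothetical IR), for a pointer not derived +// from scratch: +// before: %v = load float, ptr %p ; selects flat_load +// after: %g = addrspacecast ptr %p to ptr addrspace(1) +// %v = load float, ptr addrspace(1) %g ; selects global_load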
+struct AMDGPUFlatToGlobalLoadStorePass : public FunctionPass { + static inline char ID{0}; + AMDGPUFlatToGlobalLoadStorePass() : FunctionPass(ID) {} + + static bool originatesFromScratch(llvm::Value *ptr, + llvm::SmallPtrSetImpl<llvm::Value *> &visited) { + auto *origin = ptr->stripPointerCasts(); + // Guard against PHI cycles (loop-carried pointers): a value that is + // already being visited cannot by itself prove a scratch origin. + if (!visited.insert(origin).second) + return false; + if (llvm::isa<llvm::AllocaInst>(origin)) + return true; + if (auto *ASC = llvm::dyn_cast<llvm::AddrSpaceCastInst>(origin)) + if (ASC->getSrcAddressSpace() == 5) + return true; + if (auto *PHI = llvm::dyn_cast<llvm::PHINode>(origin)) { + for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) + if (originatesFromScratch(PHI->getIncomingValue(i), visited)) + return true; + } + if (auto *Sel = llvm::dyn_cast<llvm::SelectInst>(origin)) { + if (originatesFromScratch(Sel->getTrueValue(), visited) || + originatesFromScratch(Sel->getFalseValue(), visited)) + return true; + } + return false; + } + + static bool originatesFromScratch(llvm::Value *ptr) { + llvm::SmallPtrSet<llvm::Value *, 8> visited; + return originatesFromScratch(ptr, visited); + } + + bool runOnFunction(llvm::Function &F) override { + bool modified = false; + auto *ptr_global_ty = llvm::PointerType::get(F.getContext(), 1); + for (auto &BB : F) { + std::vector<llvm::Instruction *> to_convert; + for (auto &I : BB) { + if (auto *LI = llvm::dyn_cast<llvm::LoadInst>(&I)) { + if (LI->getPointerAddressSpace() == 0 && + !originatesFromScratch(LI->getPointerOperand())) + to_convert.push_back(LI); + } else if (auto *SI = llvm::dyn_cast<llvm::StoreInst>(&I)) { + if (SI->getPointerAddressSpace() == 0 && + !originatesFromScratch(SI->getPointerOperand())) + to_convert.push_back(SI); + } + } + for (auto *I : to_convert) { + llvm::IRBuilder<> B(I); + if (auto *LI = llvm::dyn_cast<llvm::LoadInst>(I)) { + auto *cast = B.CreateAddrSpaceCast(LI->getPointerOperand(), + ptr_global_ty); + LI->setOperand(LI->getPointerOperandIndex(), cast); + modified = true; + } else if (auto *SI = llvm::dyn_cast<llvm::StoreInst>(I)) { + auto *cast = B.CreateAddrSpaceCast(SI->getPointerOperand(), + ptr_global_ty); + SI->setOperand(SI->getPointerOperandIndex(), cast); + modified = true; + } + } + } + return modified; + } +}; + struct AMDGPUAddStructForFuncPass : public ModulePass { static inline char ID{0}; std::string func_name_; diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 81c66f4418..889bd2bd0c 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -68,6 +68,8 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, config.arch = host_arch(); } else { // AMDGPU runtime created successfully + use_device_memory_pool_ = + AMDGPUContext::get_instance().supports_mem_pool(); } #else QD_WARN("Quadrants is not compiled with AMDGPU."); @@ -146,7 +148,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, config.max_block_dim = query_max_block_dim; } if (config.saturating_grid_dim == 0) { - config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2; + config.saturating_grid_dim = num_workgroups * query_max_block_per_cu; } if (config.kernel_profiler) { AMDGPUContext::get_instance().set_profiler(profiler);