From 5a4ee2c128dd665e3b2fe3dd7c9147f0a06a41b7 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 21:30:30 -0400 Subject: [PATCH 01/44] added initial llvm codegen for amdgpu --- python/tvm/_ffi/ndarray.py | 2 + src/codegen/llvm/codegen_amdgpu.cc | 171 +++++++++++++++++++ src/runtime/module.cc | 4 +- tests/python/integration/test_gemm.py | 5 + tests/python/unittest/test_codegen_device.py | 1 + 5 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 src/codegen/llvm/codegen_amdgpu.cc diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index eb440bf06635..b7bac46157d2 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -59,6 +59,8 @@ def context(dev_type, dev_id=0): if dev_type not in TVMContext.STR2MASK: if dev_type.find("nvptx") != -1: dev_type = "cuda" + if dev_type.find("rocm") != -1: + dev_type = "rocm" if dev_type not in TVMContext.STR2MASK: raise ValueError("Unknown device type %s" % dev_type) dev_type = TVMContext.STR2MASK[dev_type] diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc new file mode 100644 index 000000000000..06d90c3ea5fb --- /dev/null +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -0,0 +1,171 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file codegen_nvptx.cc + * \brief NVPTX code generator. + */ +#ifdef TVM_LLVM_VERSION +#if TVM_ROCM_RUNTIME + +#include +#include "./codegen_llvm.h" +#include "../build_common.h" +#include "../../pass/ir_util.h" +#include "../../runtime/rocm/rocm_module.h" + +namespace tvm { +namespace codegen { + +// NVPTX code generator. +class CodeGenAMDGPU : public CodeGenLLVM { + public: + void AddFunction(const LoweredFunc& f) final { + // add function as void return value + CodeGenLLVM::AddFunctionInternal(f, true); + // annotate as kernel function + module_->getOrInsertNamedMetadata("nvvm.annotations") + ->addOperand(llvm::MDNode::get(*ctx_, { + llvm::ValueAsMetadata::get(function_), + llvm::MDString::get(*ctx_, "kernel"), + llvm::ValueAsMetadata::get(ConstInt32(1)) })); + } + + void VisitStmt_(const Allocate* op) final { + CHECK(!is_zero(op->condition)); + llvm::Value* buf = nullptr; + if (op->new_expr.defined()) { + CHECK_EQ(op->free_function, "nop"); + buf = MakeValue(op->new_expr); + } else { + int32_t constant_size = op->constant_allocation_size(); + CHECK_GT(constant_size, 0) + << "Can only handle constant size stack allocation in GPU"; + StorageInfo& info = alloc_storage_info_[op->buffer_var.get()]; + if (constant_size % 4 == 0 && info.alignment == 0) { + info.alignment = GetTempAllocaAlignment(op->type, constant_size); + } + // maximum necessary alignment in the NV devices + if (info.alignment > 16) { + info.alignment = 16; + } + if (info.scope.rank == 2) { + // const int local_address_space = 5; + // TODO(tqchen): for higher version of LLVM, local address space can be set. + llvm::AllocaInst* alloca = builder_->CreateAlloca( + LLVMType(op->type), ConstInt32(constant_size)); + if (alloca->getAlignment() < static_cast(info.alignment)) { + alloca->setAlignment(info.alignment); + } + buf = alloca; + } else { + CHECK_EQ(info.scope.rank, 1) + << "Can only allocate shared or local memory inside kernel"; + // Shared memory: address space == 3 + const unsigned shared_address_space = 3; + llvm::Type* type = llvm::ArrayType::get(LLVMType(op->type), constant_size); + // Allocate shared memory in global, address_space = 3 + llvm::GlobalVariable *global = new llvm::GlobalVariable( + *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", + nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space); + global->setAlignment(info.alignment); + buf = global; + } + } + buf = builder_->CreatePointerCast( + buf, LLVMType(op->type)->getPointerTo( + buf->getType()->getPointerAddressSpace())); + CHECK(!var_map_.count(op->buffer_var.get())); + var_map_[op->buffer_var.get()] = buf; + this->VisitStmt(op->body); + } + + // Return the thread index via intrinsics. + llvm::Value* GetThreadIndex(const IterVar& iv) final { + runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); + llvm::Intrinsic::ID intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x; + if (ts.rank == 1) { + switch (ts.dim_index) { + case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_x; break; + case 1: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_y; break; + case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_z; break; + default: LOG(FATAL) << "unknown thread idx"; + } + } + /* + else { + CHECK_EQ(ts.rank, 0); + switch (ts.dim_index) { + case 0: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x; break; + case 1: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_y; break; + case 2: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_z; break; + default: LOG(FATAL) << "unknown thread idx"; + } + } + */ + llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id); + return builder_->CreateCall(f, {}); + } + + llvm::Value* CreateStorageSync(const Call* op) final { + const std::string& sync = op->args[0].as()->value; + if (sync == "warp") { + // TODO(tqchen) warp sync in CUDA9 + return nullptr; + } else if (sync == "shared") { + llvm::Function* f = llvm::Intrinsic::getDeclaration( + module_.get(), + ::llvm::Intrinsic::amdgcn_s_barrier); + return builder_->CreateCall(f, {}); + } else { + LOG(FATAL) << "Do not support sync " << sync; + return nullptr; + } + } + + void InitPassManagerBuilder(llvm::PassManagerBuilder* builder) final { + // Additional optimization hook to tweak the builder. + } + + protected: + void InitTarget(llvm::TargetMachine* tm) final { + // Maximum vector lane = float4 + native_vector_bits_ = 4 * 32; + CodeGenLLVM::InitTarget(tm); + } +}; + +runtime::Module BuildAMDGPU(Array funcs, std::string target) { + CHECK(1) << target; + CHECK(target.length( +) >= 5 && + target.substr(0, 5) == "rocm"); + llvm::TargetMachine* tm = GetLLVMTargetMachine( + " -mcpu=gfx900" + + target.substr(5, target.length() - 5)); + std::unique_ptr cg(new CodeGenAMDGPU()); + std::unique_ptr ctx(new llvm::LLVMContext()); + cg->Init(funcs[0]->name, tm, ctx.get(), false, false); + for (LoweredFunc f : funcs) { + cg->AddFunction(f); + } + std::unique_ptr module = cg->Finish(); + llvm::SmallString<8> data; + llvm::raw_svector_ostream dest(data); + dest.SetUnbuffered(); + llvm::legacy::PassManager pass; + CHECK(tm->addPassesToEmitFile( + pass, dest, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_ObjectFile"; + pass.run(*module); + std::string hsaco(data.begin(), data.end()); + return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ""); +} + +TVM_REGISTER_API("codegen.build_amdgcn") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = BuildAMDGPU(args[0], args[1]); + }); + +} // namespace codegen +} // namespace tvm +#endif // TVM_CUDA_RUNTIME +#endif // TVM_LLVM_VERSION diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 2f2b0a214bed..4ee12053a1f3 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -125,7 +125,9 @@ bool RuntimeEnabled(const std::string& target) { f_name = "device_api.vpi"; } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") { f_name = "codegen.build_nvptx"; - } else if (target.length() >= 4 && target.substr(0, 4) == "llvm") { + } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") { + f_name = "codegen.build_rocm"; + }else if (target.length() >= 4 && target.substr(0, 4) == "llvm") { const PackedFunc* pf = runtime::Registry::Get("codegen.llvm_target_enabled"); if (pf == nullptr) return false; return (*pf)(target); diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index 0798ecf61e4f..459240f2c499 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -85,6 +85,11 @@ def check_device(device): np.testing.assert_allclose( c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) +<<<<<<< a45d3b01f7900010a9694c2b606dad22ddbe1768 +======= + check_device("nvptx -mcpu=sm_20") + check_device("rocm") +>>>>>>> added initial llvm codegen for amdgpu check_device("metal") check_device("opencl") check_device("cuda") diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py index ee3284cd640c..c4fbb4eccac1 100644 --- a/tests/python/unittest/test_codegen_device.py +++ b/tests/python/unittest/test_codegen_device.py @@ -82,6 +82,7 @@ def check_module_save(device, host="stackvm"): check_target("cuda", host="llvm") check_module_save("cuda", host="stackvm") check_target("nvptx", host="llvm") + check_target("rocm", host="llvm") if __name__ == "__main__": test_add_pipeline() From 0a5270da2a78c10374c11ee589b9e635b056a1ec Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 21:36:27 -0400 Subject: [PATCH 02/44] fixed whitespace --- src/runtime/module.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 4ee12053a1f3..43ad6e523494 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -127,7 +127,7 @@ bool RuntimeEnabled(const std::string& target) { f_name = "codegen.build_nvptx"; } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") { f_name = "codegen.build_rocm"; - }else if (target.length() >= 4 && target.substr(0, 4) == "llvm") { + } else if (target.length() >= 4 && target.substr(0, 4) == "llvm") { const PackedFunc* pf = runtime::Registry::Get("codegen.llvm_target_enabled"); if (pf == nullptr) return false; return (*pf)(target); From 01a5c93e6ffad7d8d9caf44fb68f5d5141cad4f2 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 22:31:49 -0400 Subject: [PATCH 03/44] fixed hsaco gen from ir --- src/codegen/llvm/codegen_amdgpu.cc | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 06d90c3ea5fb..27a5e0384457 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -22,11 +22,13 @@ class CodeGenAMDGPU : public CodeGenLLVM { // add function as void return value CodeGenLLVM::AddFunctionInternal(f, true); // annotate as kernel function +/* module_->getOrInsertNamedMetadata("nvvm.annotations") ->addOperand(llvm::MDNode::get(*ctx_, { llvm::ValueAsMetadata::get(function_), llvm::MDString::get(*ctx_, "kernel"), llvm::ValueAsMetadata::get(ConstInt32(1)) })); +*/ } void VisitStmt_(const Allocate* op) final { @@ -81,7 +83,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { // Return the thread index via intrinsics. llvm::Value* GetThreadIndex(const IterVar& iv) final { runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); - llvm::Intrinsic::ID intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x; + llvm::Intrinsic::ID intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_x; if (ts.rank == 1) { switch (ts.dim_index) { case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_x; break; @@ -136,11 +138,15 @@ class CodeGenAMDGPU : public CodeGenLLVM { runtime::Module BuildAMDGPU(Array funcs, std::string target) { CHECK(1) << target; CHECK(target.length( -) >= 5 && - target.substr(0, 5) == "rocm"); - llvm::TargetMachine* tm = GetLLVMTargetMachine( - " -mcpu=gfx900" + - target.substr(5, target.length() - 5)); +) >= 4 && + target.substr(0, 4) == "rocm"); + std::string triple("amdgcn-amd-amdhsa-hcc"); + std::string error; + auto llvmTarget = llvm::TargetRegistry::lookupTarget(triple, error); + auto features = ""; + llvm::TargetOptions opt; + auto RM = llvm::Optional(); + llvm::TargetMachine* tm = llvmTarget->createTargetMachine(triple, "gfx900", features, opt, RM); std::unique_ptr cg(new CodeGenAMDGPU()); std::unique_ptr ctx(new llvm::LLVMContext()); cg->Init(funcs[0]->name, tm, ctx.get(), false, false); @@ -160,7 +166,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ""); } -TVM_REGISTER_API("codegen.build_amdgcn") +TVM_REGISTER_API("codegen.build_rocm") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = BuildAMDGPU(args[0], args[1]); }); From c56287333108484358c36120c852ae43a6b78f75 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 23:08:24 -0400 Subject: [PATCH 04/44] fixed targetmachine for rocm and added GetSource for rocm --- src/codegen/llvm/codegen_amdgpu.cc | 8 +------- src/runtime/rocm/rocm_module.cc | 6 ++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 27a5e0384457..cb4303f8d989 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -140,13 +140,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { CHECK(target.length( ) >= 4 && target.substr(0, 4) == "rocm"); - std::string triple("amdgcn-amd-amdhsa-hcc"); - std::string error; - auto llvmTarget = llvm::TargetRegistry::lookupTarget(triple, error); - auto features = ""; - llvm::TargetOptions opt; - auto RM = llvm::Optional(); - llvm::TargetMachine* tm = llvmTarget->createTargetMachine(triple, "gfx900", features, opt, RM); + llvm::TargetMachine* tm = GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" + target.substr(4, target.length() - 4)); std::unique_ptr cg(new CodeGenAMDGPU()); std::unique_ptr ctx(new llvm::LLVMContext()); cg->Init(funcs[0]->name, tm, ctx.get(), false, false); diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index f7ce1f284ee2..7700a45333f7 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -59,6 +59,12 @@ class ROCMModuleNode : public runtime::ModuleNode { stream->Write(data_); } + std::string GetSource(const std::string& format) final { + if(format == fmt_) return data_; + if(fmt_ == "hsaco") LOG(WARNING)<<"HSACO"; return data_; + return ""; + } + // get a CUfunction from primary context in device_id hipFunction_t GetFunc(int device_id, const std::string& func_name) { std::lock_guard lock(mutex_); From 8ed9f9ab0032f7e4631e375df24e25bdae0c5368 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 23:14:21 -0400 Subject: [PATCH 05/44] fixed whitespace issues --- src/runtime/rocm/rocm_module.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 7700a45333f7..86793993e832 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -60,8 +60,8 @@ class ROCMModuleNode : public runtime::ModuleNode { } std::string GetSource(const std::string& format) final { - if(format == fmt_) return data_; - if(fmt_ == "hsaco") LOG(WARNING)<<"HSACO"; return data_; + if (format == fmt_) { return data_; } + if (fmt_ == "hsaco") { return data_; } return ""; } From 4700efc61648a4c7bcb26019c81a9b3ab197c947 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 23:16:14 -0400 Subject: [PATCH 06/44] changed statement to use less than 100 lines --- src/codegen/llvm/codegen_amdgpu.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index cb4303f8d989..408247b2ee72 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -140,7 +140,9 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { CHECK(target.length( ) >= 4 && target.substr(0, 4) == "rocm"); - llvm::TargetMachine* tm = GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" + target.substr(4, target.length() - 4)); + llvm::TargetMachine* tm = \ + GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" + \ + target.substr(4, target.length() - 4)); std::unique_ptr cg(new CodeGenAMDGPU()); std::unique_ptr ctx(new llvm::LLVMContext()); cg->Init(funcs[0]->name, tm, ctx.get(), false, false); From 230194e2505719a359ed5a884f9968f771f8fdbf Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 23:42:48 -0400 Subject: [PATCH 07/44] added intrinsics for workgroup - rocm --- src/codegen/llvm/codegen_amdgpu.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 408247b2ee72..635d858db048 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -92,17 +92,15 @@ class CodeGenAMDGPU : public CodeGenLLVM { default: LOG(FATAL) << "unknown thread idx"; } } - /* else { CHECK_EQ(ts.rank, 0); switch (ts.dim_index) { - case 0: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x; break; - case 1: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_y; break; - case 2: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_z; break; + case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_x; break; + case 1: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_y; break; + case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_z; break; default: LOG(FATAL) << "unknown thread idx"; } } - */ llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id); return builder_->CreateCall(f, {}); } From b1d54f699660a1c6faa5ba7bd87e16f76a4bcbf5 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 23:44:05 -0400 Subject: [PATCH 08/44] whitespace - newline error fix --- src/codegen/llvm/codegen_amdgpu.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 635d858db048..01737ca1fc37 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -91,8 +91,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_z; break; default: LOG(FATAL) << "unknown thread idx"; } - } - else { + } else { CHECK_EQ(ts.rank, 0); switch (ts.dim_index) { case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_x; break; From 84091ffe65da68a6005e0b88ab5a9c86d9107a37 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 30 Aug 2017 23:49:30 -0400 Subject: [PATCH 09/44] fixed error msg for workitem-workgroup intrinsics --- src/codegen/llvm/codegen_amdgpu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 01737ca1fc37..8a23cf7ae53b 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -89,7 +89,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_x; break; case 1: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_y; break; case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workitem_id_z; break; - default: LOG(FATAL) << "unknown thread idx"; + default: LOG(FATAL) << "unknown workitem idx"; } } else { CHECK_EQ(ts.rank, 0); @@ -97,7 +97,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_x; break; case 1: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_y; break; case 2: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_z; break; - default: LOG(FATAL) << "unknown thread idx"; + default: LOG(FATAL) << "unknown workgroup idx"; } } llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id); From bcf423d0829cce3e04095f1950bf7532f0e6f7fb Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 31 Aug 2017 12:26:13 -0400 Subject: [PATCH 10/44] added llvm ir dump for rocm codegen --- src/codegen/llvm/codegen_amdgpu.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 8a23cf7ae53b..39f63fc2091c 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -147,16 +147,19 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { cg->AddFunction(f); } std::unique_ptr module = cg->Finish(); - llvm::SmallString<8> data; - llvm::raw_svector_ostream dest(data); - dest.SetUnbuffered(); + llvm::SmallString<8> data_hsaco, data_ll; + llvm::raw_svector_ostream dest_hsaco(data_hsaco), dest_ll(data_ll); + dest_hsaco.SetUnbuffered(); + dest_ll.SetUnbuffered(); llvm::legacy::PassManager pass; CHECK(tm->addPassesToEmitFile( - pass, dest, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + pass, dest_hsaco, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_ObjectFile"; pass.run(*module); - std::string hsaco(data.begin(), data.end()); - return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ""); + module->print(dest_ll, nullptr); + std::string hsaco(data_hsaco.begin(), data_hsaco.end()); + std::string ll(data_ll.begin(), data_ll.end()); + return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } TVM_REGISTER_API("codegen.build_rocm") From cf947c6c8a912ea9a645ae2fff11d397bc3d38d3 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 1 Sep 2017 12:04:45 -0400 Subject: [PATCH 11/44] [ROCM] changed codegen to emit proper amdgpu kernel header --- src/codegen/llvm/codegen_amdgpu.cc | 6 +++--- src/codegen/llvm/codegen_llvm.cc | 6 +++--- src/codegen/llvm/codegen_llvm.h | 10 +++++++++- tests/python/integration/test_gemm.py | 3 --- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 39f63fc2091c..caf370662261 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -15,12 +15,12 @@ namespace tvm { namespace codegen { -// NVPTX code generator. +// AMDGPU code generator. class CodeGenAMDGPU : public CodeGenLLVM { public: void AddFunction(const LoweredFunc& f) final { // add function as void return value - CodeGenLLVM::AddFunctionInternal(f, true); + CodeGenLLVM::AddFunctionInternal(f, true, AMDGPU); // annotate as kernel function /* module_->getOrInsertNamedMetadata("nvvm.annotations") @@ -169,5 +169,5 @@ TVM_REGISTER_API("codegen.build_rocm") } // namespace codegen } // namespace tvm -#endif // TVM_CUDA_RUNTIME +#endif // TVM_ROCM_RUNTIME #endif // TVM_LLVM_VERSION diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 30437bb911d6..a3652f835cc8 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -90,7 +90,7 @@ void CodeGenLLVM::AddFunction(const LoweredFunc& f) { AddFunctionInternal(f, false); } -void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { +void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDeviceType dev_type) { this->InitFuncState(); is_restricted_ = f->is_restricted; CHECK(!module_->getFunction(f->name)) @@ -100,7 +100,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { Type t = arg.type(); if (t.is_handle() && f->handle_data_type.count(arg)) { arg_type.push_back( - LLVMType(f->handle_data_type[arg].type())->getPointerTo()); + LLVMType(f->handle_data_type[arg].type())->getPointerTo(dev_type == AMDGPU ? 1 : 0)); if (!is_restricted_) { alias_var_set_.insert(arg.get()); } @@ -113,7 +113,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { ret_void ? t_void_ : t_int_, arg_type, false); // setup the function. function_ = llvm::cast(module_->getOrInsertFunction(f->name, ftype)); - function_->setCallingConv(llvm::CallingConv::C); + function_->setCallingConv(dev_type == AMDGPU ? llvm::CallingConv::AMDGPU_KERNEL : llvm::CallingConv::C); // set handle argument to be non alias. if (is_restricted_) { for (size_t i = 0; i < f->args.size(); ++i) { diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index 7fa10c05ad4a..d7a8b1b1c8f2 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -23,6 +23,14 @@ namespace codegen { using namespace ir; +/*! + * \brief select codegen path for a device + */ +enum CGDeviceType { + OTHERS = 0, + AMDGPU, +}; + /*! * \brief A base class to generate a LLVM. */ @@ -148,7 +156,7 @@ class CodeGenLLVM : virtual void Optimize(); // Get the maximim storage align bits of buffer pointer given storage scope. virtual int NativeVectorBits(const runtime::StorageScope& storage_scope) const; - void AddFunctionInternal(const LoweredFunc& f, bool ret_void); + void AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDeviceType dev_type = OTHERS); // Create extern call llvm::CallInst* CreateCallExtern(llvm::Type* ret, const std::string& name, diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index 459240f2c499..5733e11a84ec 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -85,11 +85,8 @@ def check_device(device): np.testing.assert_allclose( c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) -<<<<<<< a45d3b01f7900010a9694c2b606dad22ddbe1768 -======= check_device("nvptx -mcpu=sm_20") check_device("rocm") ->>>>>>> added initial llvm codegen for amdgpu check_device("metal") check_device("opencl") check_device("cuda") From 8c61580205ff349e585823cd937f1cb508e12963 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 1 Sep 2017 12:19:56 -0400 Subject: [PATCH 12/44] fixed whitespace error --- src/codegen/llvm/codegen_llvm.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index a3652f835cc8..5ca678174198 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -113,7 +113,8 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDev ret_void ? t_void_ : t_int_, arg_type, false); // setup the function. function_ = llvm::cast(module_->getOrInsertFunction(f->name, ftype)); - function_->setCallingConv(dev_type == AMDGPU ? llvm::CallingConv::AMDGPU_KERNEL : llvm::CallingConv::C); + function_->setCallingConv(dev_type == AMDGPU ? \ + llvm::CallingConv::AMDGPU_KERNEL : llvm::CallingConv::C); // set handle argument to be non alias. if (is_restricted_) { for (size_t i = 0; i < f->args.size(); ++i) { From 14384a1c021c7a9d081516110095d3075ee593c6 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 1 Sep 2017 12:22:10 -0400 Subject: [PATCH 13/44] fixed whitespace error- 2 --- src/codegen/llvm/codegen_llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 5ca678174198..bcb5f711aced 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -113,7 +113,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDev ret_void ? t_void_ : t_int_, arg_type, false); // setup the function. function_ = llvm::cast(module_->getOrInsertFunction(f->name, ftype)); - function_->setCallingConv(dev_type == AMDGPU ? \ + function_->setCallingConv(dev_type == AMDGPU ? llvm::CallingConv::AMDGPU_KERNEL : llvm::CallingConv::C); // set handle argument to be non alias. if (is_restricted_) { From 7bd23f462081ad9fbeab74be657a5513316ff06d Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 5 Sep 2017 13:23:38 -0400 Subject: [PATCH 14/44] fixed AddFunction to not to use extra arg 1. Changed AddFunctionInternal to not to take extra arg for target type 2. Use Target from CodeGenLLVM to check for AMDGPU target --- src/codegen/llvm/codegen_amdgpu.cc | 2 +- src/codegen/llvm/codegen_llvm.cc | 57 ++++++++++++++++++++++-------- src/codegen/llvm/codegen_llvm.h | 9 +---- src/codegen/llvm/llvm_common.cc | 1 + src/runtime/rocm/rocm_module.cc | 1 + 5 files changed, 47 insertions(+), 23 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index caf370662261..0d38d686e604 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -20,7 +20,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { public: void AddFunction(const LoweredFunc& f) final { // add function as void return value - CodeGenLLVM::AddFunctionInternal(f, true, AMDGPU); + CodeGenLLVM::AddFunctionInternal(f, true); // annotate as kernel function /* module_->getOrInsertNamedMetadata("nvvm.annotations") diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index bcb5f711aced..96ed7737d1ae 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -77,6 +77,9 @@ void CodeGenLLVM::InitTarget(llvm::TargetMachine* tm) { << " for target " << target; } } + if (target == "amdgcn") { + LOG(WARNING)<< target; + } } void CodeGenLLVM::InitFuncState() { @@ -90,7 +93,8 @@ void CodeGenLLVM::AddFunction(const LoweredFunc& f) { AddFunctionInternal(f, false); } -void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDeviceType dev_type) { +void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { + bool isTargetAMD = target_machine_->getTarget().getName() == std::string("amdgcn"); this->InitFuncState(); is_restricted_ = f->is_restricted; CHECK(!module_->getFunction(f->name)) @@ -100,7 +104,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDev Type t = arg.type(); if (t.is_handle() && f->handle_data_type.count(arg)) { arg_type.push_back( - LLVMType(f->handle_data_type[arg].type())->getPointerTo(dev_type == AMDGPU ? 1 : 0)); + LLVMType(f->handle_data_type[arg].type())->getPointerTo(isTargetAMD ? 1 : 0)); if (!is_restricted_) { alias_var_set_.insert(arg.get()); } @@ -113,7 +117,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDev ret_void ? t_void_ : t_int_, arg_type, false); // setup the function. function_ = llvm::cast(module_->getOrInsertFunction(f->name, ftype)); - function_->setCallingConv(dev_type == AMDGPU ? + function_->setCallingConv(isTargetAMD ? llvm::CallingConv::AMDGPU_KERNEL : llvm::CallingConv::C); // set handle argument to be non alias. if (is_restricted_) { @@ -164,7 +168,9 @@ class MPassManager : public llvm::legacy::PassManager { public: // override add to allow messaging void add(llvm::Pass* p) final { - llvm::legacy::PassManager::add(p); + if (std::string(p->getPassName()) != "amdgcn") { + llvm::legacy::PassManager::add(p); + } } }; @@ -189,18 +195,34 @@ void CodeGenLLVM::Optimize() { target_machine_->adjustPassManager(builder); #endif - // pass manager - FPassManager fpass(module_.get()); - MPassManager mpass; - builder.populateFunctionPassManager(fpass); - builder.populateModulePassManager(mpass); + if(target_machine_->getTarget().getName() == std::string("amdgcn")) { - fpass.doInitialization(); - for (auto it = module_->begin(); it != module_->end(); ++it) { - fpass.run(*it); + llvm::legacy::FunctionPassManager amdgcnFPM(module_.get()); + llvm::legacy::PassManager amdgcnMPM; + builder.populateFunctionPassManager(amdgcnFPM); + builder.populateModulePassManager(amdgcnMPM); + + amdgcnFPM.doInitialization(); + for (auto it = module_->begin(); it != module_->end(); ++it) { + amdgcnFPM.run(*it); + } + amdgcnFPM.doFinalization(); + amdgcnMPM.run(*module_); + + } else { + // pass manager + FPassManager fpass(module_.get()); + MPassManager mpass; + builder.populateFunctionPassManager(fpass); + builder.populateModulePassManager(mpass); + + fpass.doInitialization(); + for (auto it = module_->begin(); it != module_->end(); ++it) { + fpass.run(*it); + } + fpass.doFinalization(); + mpass.run(*module_); } - fpass.doFinalization(); - mpass.run(*module_); } std::unique_ptr CodeGenLLVM::Finish() { @@ -1057,6 +1079,13 @@ void CodeGenLLVM::VisitStmt_(const For* op) { } void CodeGenLLVM::VisitStmt_(const IfThenElse* op) { + LOG(WARNING) << "VisitStmt_ IfThenElse"; + llvm::SmallString<8> ll; + llvm::raw_svector_ostream dest(ll); + dest.SetUnbuffered(); + module_->print(dest, nullptr); + std::string str(ll.begin(), ll.end()); +// LOG(WARNING) << str; using llvm::BasicBlock; BasicBlock* then_block = BasicBlock::Create( *ctx_, "if_then", function_); diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index d7a8b1b1c8f2..3998217a4fed 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -23,13 +23,6 @@ namespace codegen { using namespace ir; -/*! - * \brief select codegen path for a device - */ -enum CGDeviceType { - OTHERS = 0, - AMDGPU, -}; /*! * \brief A base class to generate a LLVM. @@ -156,7 +149,7 @@ class CodeGenLLVM : virtual void Optimize(); // Get the maximim storage align bits of buffer pointer given storage scope. virtual int NativeVectorBits(const runtime::StorageScope& storage_scope) const; - void AddFunctionInternal(const LoweredFunc& f, bool ret_void, CGDeviceType dev_type = OTHERS); + void AddFunctionInternal(const LoweredFunc& f, bool ret_void); // Create extern call llvm::CallInst* CreateCallExtern(llvm::Type* ret, const std::string& name, diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index 05972c3bcfe7..065fd0a54499 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -112,6 +112,7 @@ GetLLVMTargetMachine(const std::string& target_str, } else { opt.FloatABIType = llvm::FloatABI::Hard; } + LOG(WARNING)<< "CPU: "<createTargetMachine( target_triple, cpu, attr, opt, llvm::Reloc::PIC_); return tm; diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 86793993e832..7279d30554d4 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -203,6 +203,7 @@ Module ROCMModuleCreate( std::string hip_source) { std::shared_ptr n = std::make_shared(data, fmt, fmap, hip_source); + LOG(WARNING) << hip_source; return Module(n); } From 9860610763042d5f5afdc56b2fbf5fa3511de3d1 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 5 Sep 2017 14:18:34 -0400 Subject: [PATCH 15/44] fixed whitespaces --- src/codegen/llvm/codegen_llvm.cc | 3 +-- src/codegen/llvm/llvm_common.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 96ed7737d1ae..dd2afc7f5be5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -195,8 +195,7 @@ void CodeGenLLVM::Optimize() { target_machine_->adjustPassManager(builder); #endif - if(target_machine_->getTarget().getName() == std::string("amdgcn")) { - + if (target_machine_->getTarget().getName() == std::string("amdgcn")) { llvm::legacy::FunctionPassManager amdgcnFPM(module_.get()); llvm::legacy::PassManager amdgcnMPM; builder.populateFunctionPassManager(amdgcnFPM); diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index 065fd0a54499..38921fcd379c 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -112,7 +112,7 @@ GetLLVMTargetMachine(const std::string& target_str, } else { opt.FloatABIType = llvm::FloatABI::Hard; } - LOG(WARNING)<< "CPU: "<createTargetMachine( target_triple, cpu, attr, opt, llvm::Reloc::PIC_); return tm; From 4ba40cb3ba23cceb3e6e724d23d35e60e76f6071 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 5 Sep 2017 14:35:09 -0400 Subject: [PATCH 16/44] fixed whitespaces 2 --- src/codegen/llvm/llvm_common.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index 38921fcd379c..c6dce7233fab 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -112,7 +112,7 @@ GetLLVMTargetMachine(const std::string& target_str, } else { opt.FloatABIType = llvm::FloatABI::Hard; } - LOG(WARNING) << "CPU: "<createTargetMachine( target_triple, cpu, attr, opt, llvm::Reloc::PIC_); return tm; From 0ecb1e243d846e10ef976f1953215769305b77fd Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 6 Sep 2017 16:00:20 -0400 Subject: [PATCH 17/44] fixed codegen for AMDGPU - now generating valid IR --- src/codegen/llvm/codegen_amdgpu.cc | 7 +++-- src/codegen/llvm/codegen_llvm.cc | 49 +++++++----------------------- src/codegen/llvm/llvm_common.cc | 1 - 3 files changed, 16 insertions(+), 41 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 0d38d686e604..b90e94a65082 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -138,7 +138,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { ) >= 4 && target.substr(0, 4) == "rocm"); llvm::TargetMachine* tm = \ - GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" + \ + GetLLVMTargetMachine("-mtriple=amdgcn--amdhsa -mcpu=gfx900" + \ target.substr(4, target.length() - 4)); std::unique_ptr cg(new CodeGenAMDGPU()); std::unique_ptr ctx(new llvm::LLVMContext()); @@ -151,12 +151,15 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { llvm::raw_svector_ostream dest_hsaco(data_hsaco), dest_ll(data_ll); dest_hsaco.SetUnbuffered(); dest_ll.SetUnbuffered(); + module->print(dest_ll, nullptr); + std::string printdest_ll(data_ll.begin(), data_ll.end()); + llvm::legacy::PassManager pass; CHECK(tm->addPassesToEmitFile( pass, dest_hsaco, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_ObjectFile"; pass.run(*module); - module->print(dest_ll, nullptr); + std::string hsaco(data_hsaco.begin(), data_hsaco.end()); std::string ll(data_ll.begin(), data_ll.end()); return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index dd2afc7f5be5..d04f9f5f9d6c 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -77,9 +77,6 @@ void CodeGenLLVM::InitTarget(llvm::TargetMachine* tm) { << " for target " << target; } } - if (target == "amdgcn") { - LOG(WARNING)<< target; - } } void CodeGenLLVM::InitFuncState() { @@ -168,9 +165,7 @@ class MPassManager : public llvm::legacy::PassManager { public: // override add to allow messaging void add(llvm::Pass* p) final { - if (std::string(p->getPassName()) != "amdgcn") { - llvm::legacy::PassManager::add(p); - } + llvm::legacy::PassManager::add(p); } }; @@ -195,33 +190,18 @@ void CodeGenLLVM::Optimize() { target_machine_->adjustPassManager(builder); #endif - if (target_machine_->getTarget().getName() == std::string("amdgcn")) { - llvm::legacy::FunctionPassManager amdgcnFPM(module_.get()); - llvm::legacy::PassManager amdgcnMPM; - builder.populateFunctionPassManager(amdgcnFPM); - builder.populateModulePassManager(amdgcnMPM); - - amdgcnFPM.doInitialization(); - for (auto it = module_->begin(); it != module_->end(); ++it) { - amdgcnFPM.run(*it); - } - amdgcnFPM.doFinalization(); - amdgcnMPM.run(*module_); + // pass manager + FPassManager fpass(module_.get()); + MPassManager mpass; + builder.populateFunctionPassManager(fpass); + builder.populateModulePassManager(mpass); - } else { - // pass manager - FPassManager fpass(module_.get()); - MPassManager mpass; - builder.populateFunctionPassManager(fpass); - builder.populateModulePassManager(mpass); - - fpass.doInitialization(); - for (auto it = module_->begin(); it != module_->end(); ++it) { - fpass.run(*it); - } - fpass.doFinalization(); - mpass.run(*module_); + fpass.doInitialization(); + for (auto it = module_->begin(); it != module_->end(); ++it) { + fpass.run(*it); } + fpass.doFinalization(); + mpass.run(*module_); } std::unique_ptr CodeGenLLVM::Finish() { @@ -1078,13 +1058,6 @@ void CodeGenLLVM::VisitStmt_(const For* op) { } void CodeGenLLVM::VisitStmt_(const IfThenElse* op) { - LOG(WARNING) << "VisitStmt_ IfThenElse"; - llvm::SmallString<8> ll; - llvm::raw_svector_ostream dest(ll); - dest.SetUnbuffered(); - module_->print(dest, nullptr); - std::string str(ll.begin(), ll.end()); -// LOG(WARNING) << str; using llvm::BasicBlock; BasicBlock* then_block = BasicBlock::Create( *ctx_, "if_then", function_); diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index c6dce7233fab..05972c3bcfe7 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -112,7 +112,6 @@ GetLLVMTargetMachine(const std::string& target_str, } else { opt.FloatABIType = llvm::FloatABI::Hard; } - LOG(WARNING) << "CPU: " << cpu; llvm::TargetMachine* tm = target->createTargetMachine( target_triple, cpu, attr, opt, llvm::Reloc::PIC_); return tm; From 0bce779c31db09082e6144c8a9ae0121d58e12e3 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 7 Sep 2017 10:57:54 -0400 Subject: [PATCH 18/44] fixed codegen depending on code review --- src/codegen/llvm/codegen_amdgpu.cc | 19 +++++++++---------- src/codegen/llvm/codegen_llvm.cc | 10 ++++++---- src/codegen/llvm/codegen_llvm.h | 3 +++ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index b90e94a65082..b2b13d08c660 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -1,7 +1,7 @@ /*! * Copyright (c) 2017 by Contributors - * \file codegen_nvptx.cc - * \brief NVPTX code generator. + * \file codegen_amdgpu.cc + * \brief AMDGPU code generator. */ #ifdef TVM_LLVM_VERSION #if TVM_ROCM_RUNTIME @@ -21,14 +21,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { void AddFunction(const LoweredFunc& f) final { // add function as void return value CodeGenLLVM::AddFunctionInternal(f, true); - // annotate as kernel function -/* - module_->getOrInsertNamedMetadata("nvvm.annotations") - ->addOperand(llvm::MDNode::get(*ctx_, { - llvm::ValueAsMetadata::get(function_), - llvm::MDString::get(*ctx_, "kernel"), - llvm::ValueAsMetadata::get(ConstInt32(1)) })); -*/ + function_->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); } void VisitStmt_(const Allocate* op) final { @@ -124,6 +117,10 @@ class CodeGenAMDGPU : public CodeGenLLVM { // Additional optimization hook to tweak the builder. } + unsigned GetGlobalAddressSpace() { + return 1; + } + protected: void InitTarget(llvm::TargetMachine* tm) final { // Maximum vector lane = float4 @@ -160,6 +157,8 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { << "Cannot emit target CGFT_ObjectFile"; pass.run(*module); + LOG(WARNING) << printdest_ll; + std::string hsaco(data_hsaco.begin(), data_hsaco.end()); std::string ll(data_ll.begin(), data_ll.end()); return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index d04f9f5f9d6c..743575cb65aa 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -91,7 +91,6 @@ void CodeGenLLVM::AddFunction(const LoweredFunc& f) { } void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { - bool isTargetAMD = target_machine_->getTarget().getName() == std::string("amdgcn"); this->InitFuncState(); is_restricted_ = f->is_restricted; CHECK(!module_->getFunction(f->name)) @@ -101,7 +100,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { Type t = arg.type(); if (t.is_handle() && f->handle_data_type.count(arg)) { arg_type.push_back( - LLVMType(f->handle_data_type[arg].type())->getPointerTo(isTargetAMD ? 1 : 0)); + LLVMType(f->handle_data_type[arg].type())->getPointerTo(GetGlobalAddressSpace())); if (!is_restricted_) { alias_var_set_.insert(arg.get()); } @@ -114,8 +113,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { ret_void ? t_void_ : t_int_, arg_type, false); // setup the function. function_ = llvm::cast(module_->getOrInsertFunction(f->name, ftype)); - function_->setCallingConv(isTargetAMD ? - llvm::CallingConv::AMDGPU_KERNEL : llvm::CallingConv::C); + function_->setCallingConv(llvm::CallingConv::C); // set handle argument to be non alias. if (is_restricted_) { for (size_t i = 0; i < f->args.size(); ++i) { @@ -557,6 +555,10 @@ int CodeGenLLVM::NativeVectorBits(const runtime::StorageScope& storage_scope) co return native_vector_bits_; } +unsigned CodeGenLLVM::GetGlobalAddressSpace() { + return 0; +} + void CodeGenLLVM::GetAlignment( Type t, const Variable* buf_var, const Expr& index, int* p_alignment, int* p_native_bits) { diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index 3998217a4fed..d055d7d5a1c3 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -149,6 +149,9 @@ class CodeGenLLVM : virtual void Optimize(); // Get the maximim storage align bits of buffer pointer given storage scope. virtual int NativeVectorBits(const runtime::StorageScope& storage_scope) const; + // Get correct address space depending on the backend + virtual unsigned GetGlobalAddressSpace(); + void AddFunctionInternal(const LoweredFunc& f, bool ret_void); // Create extern call llvm::CallInst* CreateCallExtern(llvm::Type* ret, From ae276a38d671525a8980d7ec83887454956e356d Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 7 Sep 2017 13:19:25 -0400 Subject: [PATCH 19/44] reviewed alignment for amd devices --- src/codegen/llvm/codegen_amdgpu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index b2b13d08c660..ef79864a67ae 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -38,7 +38,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { if (constant_size % 4 == 0 && info.alignment == 0) { info.alignment = GetTempAllocaAlignment(op->type, constant_size); } - // maximum necessary alignment in the NV devices + // maximum necessary alignment in the AMD devices if (info.alignment > 16) { info.alignment = 16; } From 54d02d6b6c795e9cbe3e1af69728aa1f05788046 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 7 Sep 2017 15:55:50 -0400 Subject: [PATCH 20/44] added code to dump code object to file --- src/codegen/llvm/codegen_amdgpu.cc | 22 ++++++++++++++++++---- src/runtime/rocm/rocm_module.cc | 4 +++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index ef79864a67ae..e43eb1f8c11a 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -144,7 +144,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { cg->AddFunction(f); } std::unique_ptr module = cg->Finish(); - llvm::SmallString<8> data_hsaco, data_ll; + llvm::SmallString<8> data_hsaco, data_ll, data_isa; llvm::raw_svector_ostream dest_hsaco(data_hsaco), dest_ll(data_ll); dest_hsaco.SetUnbuffered(); dest_ll.SetUnbuffered(); @@ -152,13 +152,27 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { std::string printdest_ll(data_ll.begin(), data_ll.end()); llvm::legacy::PassManager pass; + // (TODO) adityaatluri: Generate CGFT_AssemblyFile for debugging kernels CHECK(tm->addPassesToEmitFile( - pass, dest_hsaco, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + pass, dest_hsaco, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; - pass.run(*module); - LOG(WARNING) << printdest_ll; + auto FileName = "/tmp/output.o"; + std::error_code EC; + llvm::raw_fd_ostream dest(FileName, EC, llvm::sys::fs::F_None); + + llvm::legacy::PassManager p; + auto FileType = llvm::TargetMachine::CGFT_ObjectFile; + + if(tm->addPassesToEmitFile(p, dest, FileType)) { + LOG(FATAL) << "Couldn't dump to file"; + } + p.run(*module); + dest.flush(); + + + pass.run(*module); std::string hsaco(data_hsaco.begin(), data_hsaco.end()); std::string ll(data_ll.begin(), data_ll.end()); return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 7279d30554d4..16027f6e27ae 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -69,6 +69,7 @@ class ROCMModuleNode : public runtime::ModuleNode { hipFunction_t GetFunc(int device_id, const std::string& func_name) { std::lock_guard lock(mutex_); // must recheck under the lock scope + if (module_[device_id] == nullptr) { ROCM_DRIVER_CALL(hipModuleLoadData(&(module_[device_id]), data_.c_str())); } @@ -146,7 +147,9 @@ class ROCMWrappedFunc { if (fcache_[device_id] == nullptr) { fcache_[device_id] = m_->GetFunc(device_id, func_name_); } + hipStream_t strm = static_cast(ROCMThreadEntry::ThreadLocal()->stream); + ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); void* config[] = { HIP_LAUNCH_PARAM_BUFFER_POINTER, &packed_args, @@ -187,7 +190,6 @@ PackedFunc ROCMModuleNode::GetFunction( CHECK_EQ(sptr_to_self.get(), this); CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; - auto it = fmap_.find(name); if (it == fmap_.end()) return PackedFunc(); const FunctionInfo& info = it->second; From e6d532d857ad53ef36569cf07a69bdc503830c58 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 7 Sep 2017 16:10:45 -0400 Subject: [PATCH 21/44] fixed cpplint errors --- src/codegen/llvm/codegen_amdgpu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index e43eb1f8c11a..30ab7a1a3a6d 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -164,8 +164,8 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { llvm::legacy::PassManager p; auto FileType = llvm::TargetMachine::CGFT_ObjectFile; - if(tm->addPassesToEmitFile(p, dest, FileType)) { - LOG(FATAL) << "Couldn't dump to file"; + if (tm->addPassesToEmitFile(p, dest, FileType)) { + LOG(FATAL) << "Couldn't dump to file"; } p.run(*module); From 52d8e2df357d7116ad6d90319d25d1253ee4b598 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 7 Sep 2017 18:24:14 -0400 Subject: [PATCH 22/44] print out IR after pass manager --- src/codegen/llvm/codegen_amdgpu.cc | 19 +++---------------- src/runtime/rocm/rocm_module.cc | 1 - 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 30ab7a1a3a6d..836cfea290fd 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -148,7 +148,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { llvm::raw_svector_ostream dest_hsaco(data_hsaco), dest_ll(data_ll); dest_hsaco.SetUnbuffered(); dest_ll.SetUnbuffered(); - module->print(dest_ll, nullptr); +// module->print(dest_ll, nullptr); std::string printdest_ll(data_ll.begin(), data_ll.end()); llvm::legacy::PassManager pass; @@ -157,24 +157,11 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { pass, dest_hsaco, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; - auto FileName = "/tmp/output.o"; - std::error_code EC; - llvm::raw_fd_ostream dest(FileName, EC, llvm::sys::fs::F_None); - - llvm::legacy::PassManager p; - auto FileType = llvm::TargetMachine::CGFT_ObjectFile; - - if (tm->addPassesToEmitFile(p, dest, FileType)) { - LOG(FATAL) << "Couldn't dump to file"; - } - - p.run(*module); - dest.flush(); - - pass.run(*module); + module->print(dest_ll, nullptr); std::string hsaco(data_hsaco.begin(), data_hsaco.end()); std::string ll(data_ll.begin(), data_ll.end()); + LOG(WARNING) << ll; return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 16027f6e27ae..2839e10945f8 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -205,7 +205,6 @@ Module ROCMModuleCreate( std::string hip_source) { std::shared_ptr n = std::make_shared(data, fmt, fmap, hip_source); - LOG(WARNING) << hip_source; return Module(n); } From 11d7585252e35abf82fdcc5cb657c38802781878 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 11 Sep 2017 11:50:44 -0400 Subject: [PATCH 23/44] added code to dump asm, obj to file and std string --- src/codegen/llvm/codegen_amdgpu.cc | 96 +++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 14 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 836cfea290fd..c76ed95b9a8a 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -12,6 +12,14 @@ #include "../../pass/ir_util.h" #include "../../runtime/rocm/rocm_module.h" +namespace llvm { + extern "C" void LLVMInitializeAMDGPUTargetInfo(); + extern "C" void LLVMInitializeAMDGPUTarget(); + extern "C" void LLVMInitializeAMDGPUTargetMC(); + extern "C" void LLVMInitializeAMDGPUAsmParser(); + extern "C" void LLVMInitializeAMDGPUAsmPrinter(); +} + namespace tvm { namespace codegen { @@ -134,34 +142,94 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { CHECK(target.length( ) >= 4 && target.substr(0, 4) == "rocm"); - llvm::TargetMachine* tm = \ - GetLLVMTargetMachine("-mtriple=amdgcn--amdhsa -mcpu=gfx900" + \ +// llvm::TargetMachine* tm = \ + GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" + \ target.substr(4, target.length() - 4)); + auto TargetTriple = std::string("amdgcn-amd-amdhsa-hcc"); + + llvm::LLVMInitializeAMDGPUTargetInfo(); + llvm::LLVMInitializeAMDGPUTarget(); + llvm::LLVMInitializeAMDGPUTargetMC(); + llvm::LLVMInitializeAMDGPUAsmParser(); + llvm::LLVMInitializeAMDGPUAsmPrinter(); + + std::string Error; + auto Target = llvm::TargetRegistry::lookupTarget(TargetTriple, Error); + + if(!Target) { + LOG(WARNING) << Error; + } + + auto GPU = "gfx900"; + auto Features = ""; + + llvm::TargetOptions opt; + auto RM = llvm::Optional(); + auto tm = Target->createTargetMachine(TargetTriple, GPU, Features, opt, RM); + + + LOG(WARNING) << target; std::unique_ptr cg(new CodeGenAMDGPU()); std::unique_ptr ctx(new llvm::LLVMContext()); cg->Init(funcs[0]->name, tm, ctx.get(), false, false); for (LoweredFunc f : funcs) { cg->AddFunction(f); } + std::unique_ptr module = cg->Finish(); - llvm::SmallString<8> data_hsaco, data_ll, data_isa; - llvm::raw_svector_ostream dest_hsaco(data_hsaco), dest_ll(data_ll); - dest_hsaco.SetUnbuffered(); - dest_ll.SetUnbuffered(); -// module->print(dest_ll, nullptr); - std::string printdest_ll(data_ll.begin(), data_ll.end()); + llvm::SmallString<8> dataObj, data_ll, dataAsm; + llvm::raw_svector_ostream destObj(dataObj), dest_ll(data_ll), destAsm(dataAsm); + destObj.SetUnbuffered(); + dest_ll.SetUnbuffered(); + destAsm.SetUnbuffered(); + module->print(dest_ll, nullptr); + std::unique_ptr mAsm = llvm::CloneModule(module.get()); + std::unique_ptr mObj = llvm::CloneModule(module.get()); + std::unique_ptr mAsmFile = llvm::CloneModule(module.get()); + std::unique_ptr mObjFile = llvm::CloneModule(module.get()); llvm::legacy::PassManager pass; - // (TODO) adityaatluri: Generate CGFT_AssemblyFile for debugging kernels + + auto fnAsm = "output.s"; + auto fnObj = "output.co"; + std::error_code EC; + llvm::raw_fd_ostream destAsmFile(fnAsm, EC, llvm::sys::fs::F_None); + llvm::raw_fd_ostream destObjFile(fnObj, EC, llvm::sys::fs::F_None); + CHECK(tm->addPassesToEmitFile( - pass, dest_hsaco, llvm::TargetMachine::CGFT_ObjectFile) == 0) - << "Cannot emit target CGFT_ObjectFile"; + pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0) + << "Cannot emit target CGFT_ObjectFile"; + + CHECK(tm->addPassesToEmitFile( + pass, destAsm, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + + CHECK(tm->addPassesToEmitFile( + pass, destObjFile, llvm::TargetMachine::CGFT_ObjectFile) == 0) + << "Cannot emit target CGFT_ObjectFile"; + + CHECK(tm->addPassesToEmitFile( + pass, destAsmFile, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + + + pass.run(*mAsm); + pass.run(*mObj); + pass.run(*mAsmFile); + pass.run(*mObjFile); + + destAsmFile.flush(); + destObjFile.flush(); - pass.run(*module); - module->print(dest_ll, nullptr); - std::string hsaco(data_hsaco.begin(), data_hsaco.end()); std::string ll(data_ll.begin(), data_ll.end()); + std::string hsaco(dataObj.begin(), dataObj.end()); + std::string isa(dataAsm.begin(), dataAsm.end()); + LOG(WARNING) << ll; + LOG(WARNING) << isa; + + + return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } From 1ca7418fbaa5e9deb4671a413a39e0453d1c1e5d Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 11 Sep 2017 11:52:19 -0400 Subject: [PATCH 24/44] fixed whitespaces --- src/codegen/llvm/codegen_amdgpu.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index c76ed95b9a8a..b98009438f48 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -156,7 +156,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { std::string Error; auto Target = llvm::TargetRegistry::lookupTarget(TargetTriple, Error); - if(!Target) { + if (!Target) { LOG(WARNING) << Error; } @@ -227,8 +227,6 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { LOG(WARNING) << ll; LOG(WARNING) << isa; - - return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } From bb520d9dafbaeffbd29b016cdd9142a5d6d23915 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 11 Sep 2017 22:50:24 -0500 Subject: [PATCH 25/44] Update codegen_amdgpu.cc --- src/codegen/llvm/codegen_amdgpu.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index b98009438f48..1c4c49c8c75e 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -7,6 +7,7 @@ #if TVM_ROCM_RUNTIME #include +#include #include "./codegen_llvm.h" #include "../build_common.h" #include "../../pass/ir_util.h" From fb29bed2256d84a959739cfb3cbc18c99b3e474f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 11 Sep 2017 22:56:33 -0500 Subject: [PATCH 26/44] used registry for amdgpu llvm --- src/codegen/llvm/codegen_amdgpu.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 1c4c49c8c75e..e3563c8865f3 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -8,6 +8,7 @@ #include #include +#include #include "./codegen_llvm.h" #include "../build_common.h" #include "../../pass/ir_util.h" @@ -228,6 +229,9 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { LOG(WARNING) << ll; LOG(WARNING) << isa; + + const auto* f = Registry::Get("tvm_callback_rocm_link”); + CHECK(f != nullptr) << “Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm”; return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } From 38805f5edea40411d2e62f349344f4cb9bab7dce Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 11 Sep 2017 23:03:25 -0500 Subject: [PATCH 27/44] Fixed whitespaces --- src/codegen/llvm/codegen_amdgpu.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index e3563c8865f3..724a43805158 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -229,8 +229,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { LOG(WARNING) << ll; LOG(WARNING) << isa; - - const auto* f = Registry::Get("tvm_callback_rocm_link”); + const auto* f = Registry::Get("tvm_callback_rocm_link"); CHECK(f != nullptr) << “Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm”; return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); From 8876cdeaced6d2e714bd540586149026ad3d8d5a Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 12:05:07 -0400 Subject: [PATCH 28/44] added code for calling linker --- src/codegen/llvm/codegen_amdgpu.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 724a43805158..58abda521f9a 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -232,6 +232,13 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { const auto* f = Registry::Get("tvm_callback_rocm_link"); CHECK(f != nullptr) << “Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm”; + std::string obj_blob; + TVMByteArray arr; + arr.data = &obj_blob[0]; + arr.size = obj_blob.length(); + + std::string hso = (*f)(obj_blob); + return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } From b0c38f750a459af21c87e2bc56a2d11c08233c46 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 12:11:09 -0400 Subject: [PATCH 29/44] fixed formatting errors --- src/codegen/llvm/codegen_amdgpu.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 58abda521f9a..a55c1ef81400 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -229,8 +229,9 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { LOG(WARNING) << ll; LOG(WARNING) << isa; - const auto* f = Registry::Get("tvm_callback_rocm_link"); - CHECK(f != nullptr) << “Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm”; + + const auto* f = tvm::runtime::Registry::Get("tvm_callback_rocm_link"); + CHECK(f != nullptr) << "Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm"; std::string obj_blob; TVMByteArray arr; From fcd7cc088354a29f70e5773571de9ccac2f81796 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 16:19:05 -0400 Subject: [PATCH 30/44] added rocm link python interface --- dmlc-core | 2 +- python/tvm/contrib/rocm.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 python/tvm/contrib/rocm.py diff --git a/dmlc-core b/dmlc-core index 46886a6b47f6..a527100d7d50 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 46886a6b47f660cda581e497378204ccc029a01e +Subproject commit a527100d7d5001efc4954848a2fc6027e48c05f4 diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py new file mode 100644 index 000000000000..ec62724bbb71 --- /dev/null +++ b/python/tvm/contrib/rocm.py @@ -0,0 +1,7 @@ +# tvm.contrib.rocm + +from . import util + +@tvm.register_func("tvm_callback_rocm_link") +def callback_rocm_link(obj_bin): + return obj_bin From 6e9a0e9baedbe4d2e30eefc3b6c369e926ba4b50 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 16:36:17 -0400 Subject: [PATCH 31/44] fixed pylint issues and added more body to the function --- python/tvm/contrib/rocm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index ec62724bbb71..c366bb0dcab9 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -4,4 +4,10 @@ @tvm.register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): - return obj_bin + tmp_dir = util.tempdir() + tmp_obj = temp_dir.reloath("rocm_kernel.o") + tmp_cobj = temp_dir.reloath("rocm_kernel.co") + with open(tmp_obj, "wb") as out_file: + out_file.write(bytes(obj_bin)) + cobj_bin = bytearray(open(temp_cobj, "rb").read()) + return cobj_bin From e57aa24ccaefe00485d8640198a9d2ddee2a45cb Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 16:45:23 -0400 Subject: [PATCH 32/44] added doc string --- python/tvm/contrib/rocm.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index c366bb0dcab9..a26d26a30431 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -4,10 +4,22 @@ @tvm.register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): + """Links object file generated from LLVM to HSA Code Object + + Parameters + ---------- + obj_bin : str + The object file + + Return + ------ + cobj_bin : str + The HSA Code Object + """ tmp_dir = util.tempdir() tmp_obj = temp_dir.reloath("rocm_kernel.o") tmp_cobj = temp_dir.reloath("rocm_kernel.co") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(obj_bin)) - cobj_bin = bytearray(open(temp_cobj, "rb").read()) + cobj_bin = bytearray(open(tmp_cobj, "rb").read()) return cobj_bin From 84044e3a9c9b44e02a6e8aea2ddf5c73e45d5d63 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 16:50:38 -0400 Subject: [PATCH 33/44] added doc string for module --- python/tvm/contrib/rocm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index a26d26a30431..f244ea12f0e6 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -1,5 +1,8 @@ # tvm.contrib.rocm - +"""Utility to convert object file to HSA Code Object +The object file received from LLVM PassManager is relocatable ELF object. +The routine in this file uses lld to convert it to shared ELF object. +""" from . import util @tvm.register_func("tvm_callback_rocm_link") @@ -17,8 +20,8 @@ def callback_rocm_link(obj_bin): The HSA Code Object """ tmp_dir = util.tempdir() - tmp_obj = temp_dir.reloath("rocm_kernel.o") - tmp_cobj = temp_dir.reloath("rocm_kernel.co") + tmp_obj = tmp_dir.reloath("rocm_kernel.o") + tmp_cobj = tmp_dir.reloath("rocm_kernel.co") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(obj_bin)) cobj_bin = bytearray(open(tmp_cobj, "rb").read()) From a6c053b59871c3baa812d64ab040a72f4eb433e2 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 17:13:25 -0400 Subject: [PATCH 34/44] fixed python code after review, fixed llvm object codegen --- python/tvm/contrib/rocm.py | 10 +++------- src/codegen/llvm/codegen_amdgpu.cc | 11 +++++++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index f244ea12f0e6..ccb64a77ce44 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -1,8 +1,4 @@ -# tvm.contrib.rocm -"""Utility to convert object file to HSA Code Object -The object file received from LLVM PassManager is relocatable ELF object. -The routine in this file uses lld to convert it to shared ELF object. -""" +"""Utility for ROCm backend""" from . import util @tvm.register_func("tvm_callback_rocm_link") @@ -11,12 +7,12 @@ def callback_rocm_link(obj_bin): Parameters ---------- - obj_bin : str + obj_bin : bytearray The object file Return ------ - cobj_bin : str + cobj_bin : bytearray The HSA Code Object """ tmp_dir = util.tempdir() diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index a55c1ef81400..31dea7b37b9f 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -201,7 +201,10 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { CHECK(tm->addPassesToEmitFile( pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; + pass.run(*mObj); + std::string obj(dataObj.begin(), dataObj.end()); +/* CHECK(tm->addPassesToEmitFile( pass, destAsm, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; @@ -229,16 +232,16 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { LOG(WARNING) << ll; LOG(WARNING) << isa; +*/ const auto* f = tvm::runtime::Registry::Get("tvm_callback_rocm_link"); CHECK(f != nullptr) << "Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm"; - std::string obj_blob; TVMByteArray arr; - arr.data = &obj_blob[0]; - arr.size = obj_blob.length(); + arr.data = &obj[0]; + arr.size = obj.length(); - std::string hso = (*f)(obj_blob); + std::string hsaco = (*f)(arr), ll; return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); } From c218cd388dcc75f6166d3406e055865b41f53e09 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 18:26:25 -0400 Subject: [PATCH 35/44] fixed linker to generate code object --- python/tvm/__init__.py | 1 + python/tvm/contrib/rocm.py | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index ecb6200807fb..f3f908f2ef53 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -29,3 +29,4 @@ from .schedule import create_schedule from .build_module import build, lower, build_config from .tag import tag_scope +from .contrib import rocm as _rocm diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index ccb64a77ce44..9fbce79f91ac 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -1,7 +1,22 @@ """Utility for ROCm backend""" +import sys, subprocess from . import util +from ..api import register_func -@tvm.register_func("tvm_callback_rocm_link") +def rocm_link(in_file, out_file): + cmd = ["ld.lld"] + cmd += ["-shared"] + cmd += [in_file] + cmd += ["-o"] + cmd += [out_file] + args = ' '.join(cmd) + proc = subprocess.Popen( + args, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + +@register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): """Links object file generated from LLVM to HSA Code Object @@ -16,9 +31,11 @@ def callback_rocm_link(obj_bin): The HSA Code Object """ tmp_dir = util.tempdir() - tmp_obj = tmp_dir.reloath("rocm_kernel.o") - tmp_cobj = tmp_dir.reloath("rocm_kernel.co") + tmp_obj = tmp_dir.relpath("rocm_kernel.o") + tmp_cobj = tmp_dir.relpath("rocm_kernel.co") + rocm_link(tmp_obj, tmp_cobj) with open(tmp_obj, "wb") as out_file: out_file.write(bytes(obj_bin)) + rocm_link(tmp_obj, tmp_cobj) cobj_bin = bytearray(open(tmp_cobj, "rb").read()) return cobj_bin From 1afa473d3674a9b4753a04ec3b2b64e9392b00be Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 18:46:51 -0400 Subject: [PATCH 36/44] removed dumping to output file and debugging log out --- src/codegen/llvm/codegen_amdgpu.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index 31dea7b37b9f..fe070e5b131e 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -192,11 +192,6 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { std::unique_ptr mObjFile = llvm::CloneModule(module.get()); llvm::legacy::PassManager pass; - auto fnAsm = "output.s"; - auto fnObj = "output.co"; - std::error_code EC; - llvm::raw_fd_ostream destAsmFile(fnAsm, EC, llvm::sys::fs::F_None); - llvm::raw_fd_ostream destObjFile(fnObj, EC, llvm::sys::fs::F_None); CHECK(tm->addPassesToEmitFile( pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0) From 8fd4efc8e9e43850ed3bd2f2b5bc3e2f024eb523 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 12 Sep 2017 18:52:42 -0400 Subject: [PATCH 37/44] fixed lint for python code --- python/tvm/contrib/rocm.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index 9fbce79f91ac..af7bad64dd91 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -1,9 +1,19 @@ """Utility for ROCm backend""" -import sys, subprocess +import subprocess from . import util from ..api import register_func def rocm_link(in_file, out_file): + """Link relocatable ELF object to shared ELF object using lld + + Parameters + ---------- + in_file : str + Input file name (relocatable ELF object file) + + out_file : str + Output file name (shared ELF object file) + """ cmd = ["ld.lld"] cmd += ["-shared"] cmd += [in_file] @@ -14,7 +24,7 @@ def rocm_link(in_file, out_file): args, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = proc.communicate() + proc.communicate() @register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): From 80dceee3935db646f8609ca571c1f2cf20a2925e Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 12:10:22 -0400 Subject: [PATCH 38/44] added fault check after running linker --- python/tvm/contrib/rocm.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index af7bad64dd91..4bb2b86e9fed 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -1,4 +1,5 @@ """Utility for ROCm backend""" +import sys import subprocess from . import util from ..api import register_func @@ -14,17 +15,18 @@ def rocm_link(in_file, out_file): out_file : str Output file name (shared ELF object file) """ - cmd = ["ld.lld"] - cmd += ["-shared"] - cmd += [in_file] - cmd += ["-o"] - cmd += [out_file] - args = ' '.join(cmd) + args = "ld.lld -shared " + in_file + " -o " + out_file + print args proc = subprocess.Popen( args, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - proc.communicate() + (out, _) = proc.communicate() + + if proc.returncode != 0: + sys.stderr.write("Linking error using ld.lld:\n") + sys.stderr.write(str(out)) + sys.stderr.flush() @register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): @@ -43,7 +45,6 @@ def callback_rocm_link(obj_bin): tmp_dir = util.tempdir() tmp_obj = tmp_dir.relpath("rocm_kernel.o") tmp_cobj = tmp_dir.relpath("rocm_kernel.co") - rocm_link(tmp_obj, tmp_cobj) with open(tmp_obj, "wb") as out_file: out_file.write(bytes(obj_bin)) rocm_link(tmp_obj, tmp_cobj) From c3b39ca5091e171af6e9ab3fd78f16fa96dd75ff Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 12:13:57 -0400 Subject: [PATCH 39/44] removed print statement in rocm.py --- python/tvm/contrib/rocm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index 4bb2b86e9fed..24d2b11297cb 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -16,7 +16,6 @@ def rocm_link(in_file, out_file): Output file name (shared ELF object file) """ args = "ld.lld -shared " + in_file + " -o " + out_file - print args proc = subprocess.Popen( args, shell=True, stdout=subprocess.PIPE, From 678cb41c58affe9fd14ebaf33a583cd5d4a92c26 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 12:44:17 -0400 Subject: [PATCH 40/44] changed rocm lld linker to raise runtimeerror than emitting error log to stderr --- python/tvm/contrib/rocm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index 24d2b11297cb..f7c267bf3a79 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -1,5 +1,4 @@ """Utility for ROCm backend""" -import sys import subprocess from . import util from ..api import register_func @@ -23,9 +22,9 @@ def rocm_link(in_file, out_file): (out, _) = proc.communicate() if proc.returncode != 0: - sys.stderr.write("Linking error using ld.lld:\n") - sys.stderr.write(str(out)) - sys.stderr.flush() + msg = "Linking error using ld.lld:\n" + msg += str(out) + raise RuntimeError(msg) @register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): From 29b60e9824dfd471acea3e8477d4f438ebefade7 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 12:45:55 -0400 Subject: [PATCH 41/44] changed the way linker command line is pass to subprocess.popen --- python/tvm/contrib/rocm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index f7c267bf3a79..c367aef24e21 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -14,9 +14,9 @@ def rocm_link(in_file, out_file): out_file : str Output file name (shared ELF object file) """ - args = "ld.lld -shared " + in_file + " -o " + out_file + args = ["ld.lld", "-shared", in_file, "-o", out_file] proc = subprocess.Popen( - args, shell=True, + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() From e48b48ece35414cb82a46f79a6f59b91a7f48d5b Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 12:50:19 -0400 Subject: [PATCH 42/44] removed redundant code and reuse tvm utils --- src/codegen/llvm/codegen_amdgpu.cc | 33 +----------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index fe070e5b131e..ef99fc11cd6a 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -14,14 +14,6 @@ #include "../../pass/ir_util.h" #include "../../runtime/rocm/rocm_module.h" -namespace llvm { - extern "C" void LLVMInitializeAMDGPUTargetInfo(); - extern "C" void LLVMInitializeAMDGPUTarget(); - extern "C" void LLVMInitializeAMDGPUTargetMC(); - extern "C" void LLVMInitializeAMDGPUAsmParser(); - extern "C" void LLVMInitializeAMDGPUAsmPrinter(); -} - namespace tvm { namespace codegen { @@ -144,33 +136,10 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { CHECK(target.length( ) >= 4 && target.substr(0, 4) == "rocm"); -// llvm::TargetMachine* tm = \ + llvm::TargetMachine* tm = \ GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900" + \ target.substr(4, target.length() - 4)); - auto TargetTriple = std::string("amdgcn-amd-amdhsa-hcc"); - - llvm::LLVMInitializeAMDGPUTargetInfo(); - llvm::LLVMInitializeAMDGPUTarget(); - llvm::LLVMInitializeAMDGPUTargetMC(); - llvm::LLVMInitializeAMDGPUAsmParser(); - llvm::LLVMInitializeAMDGPUAsmPrinter(); - - std::string Error; - auto Target = llvm::TargetRegistry::lookupTarget(TargetTriple, Error); - - if (!Target) { - LOG(WARNING) << Error; - } - - auto GPU = "gfx900"; - auto Features = ""; - - llvm::TargetOptions opt; - auto RM = llvm::Optional(); - auto tm = Target->createTargetMachine(TargetTriple, GPU, Features, opt, RM); - - LOG(WARNING) << target; std::unique_ptr cg(new CodeGenAMDGPU()); std::unique_ptr ctx(new llvm::LLVMContext()); cg->Init(funcs[0]->name, tm, ctx.get(), false, false); From 939e3ef30d5c18a7af2bf99ad07a3bfba46d5c39 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 13:17:51 -0400 Subject: [PATCH 43/44] removed commented out code --- src/codegen/llvm/codegen_amdgpu.cc | 31 ------------------------------ 1 file changed, 31 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index ef99fc11cd6a..ad8857e125a1 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -132,7 +132,6 @@ class CodeGenAMDGPU : public CodeGenLLVM { }; runtime::Module BuildAMDGPU(Array funcs, std::string target) { - CHECK(1) << target; CHECK(target.length( ) >= 4 && target.substr(0, 4) == "rocm"); @@ -168,36 +167,6 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { pass.run(*mObj); std::string obj(dataObj.begin(), dataObj.end()); -/* - CHECK(tm->addPassesToEmitFile( - pass, destAsm, llvm::TargetMachine::CGFT_AssemblyFile) == 0) - << "Cannot emit target CGFT_AssemblyFile"; - - CHECK(tm->addPassesToEmitFile( - pass, destObjFile, llvm::TargetMachine::CGFT_ObjectFile) == 0) - << "Cannot emit target CGFT_ObjectFile"; - - CHECK(tm->addPassesToEmitFile( - pass, destAsmFile, llvm::TargetMachine::CGFT_AssemblyFile) == 0) - << "Cannot emit target CGFT_AssemblyFile"; - - - pass.run(*mAsm); - pass.run(*mObj); - pass.run(*mAsmFile); - pass.run(*mObjFile); - - destAsmFile.flush(); - destObjFile.flush(); - - std::string ll(data_ll.begin(), data_ll.end()); - std::string hsaco(dataObj.begin(), dataObj.end()); - std::string isa(dataAsm.begin(), dataAsm.end()); - - LOG(WARNING) << ll; - LOG(WARNING) << isa; -*/ - const auto* f = tvm::runtime::Registry::Get("tvm_callback_rocm_link"); CHECK(f != nullptr) << "Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm"; From 4501e8e56c42e19fc4badd454c6395eb1ceacf26 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 13 Sep 2017 14:14:02 -0400 Subject: [PATCH 44/44] removed cloning of unused modules, and put IR into string --- src/codegen/llvm/codegen_amdgpu.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index ad8857e125a1..4769efdb0405 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -156,11 +156,8 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { module->print(dest_ll, nullptr); std::unique_ptr mAsm = llvm::CloneModule(module.get()); std::unique_ptr mObj = llvm::CloneModule(module.get()); - std::unique_ptr mAsmFile = llvm::CloneModule(module.get()); - std::unique_ptr mObjFile = llvm::CloneModule(module.get()); llvm::legacy::PassManager pass; - CHECK(tm->addPassesToEmitFile( pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; @@ -174,7 +171,8 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { arr.data = &obj[0]; arr.size = obj.length(); - std::string hsaco = (*f)(arr), ll; + std::string hsaco = (*f)(arr); + std::string ll(data_ll.begin(), data_ll.end()); return ROCMModuleCreate(hsaco, "hsaco", ExtractFuncInfo(funcs), ll); }