From 07057f45c8d90bc79bff3d5df8e86ce39937abfb Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 12 Apr 2021 16:10:21 -0700 Subject: [PATCH 1/6] Refactor OpenCL runtime module to build separate cl_programs for each kernel. This can avoid pathological bugs in the vendor specific OpenCL compiler that may be triggered with large programs. --- src/runtime/opencl/opencl_common.h | 8 ++-- src/runtime/opencl/opencl_module.cc | 65 +++++++++++++++++++++-------- src/target/source/codegen_opencl.cc | 18 ++++---- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index b4377119e4c7..363e0d3b5c0e 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -329,14 +329,14 @@ class OpenCLModuleNode : public ModuleNode { std::mutex build_lock_; // The OpenCL source. std::string source_; - // the binary data - cl_program program_{nullptr}; - // build info - std::vector device_built_flag_; + // Mapping from primitive name to cl program for each device. + std::unordered_map> programs_; // kernel id cache std::unordered_map kid_map_; // kernels build so far. 
std::vector kernels_; + // parsed kernel data + std::unordered_map parsed_kernels_; }; } // namespace runtime diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 8c22c3c8cb23..a61302baa482 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -105,8 +105,14 @@ OpenCLModuleNode::~OpenCLModuleNode() { for (cl_kernel k : kernels_) { OPENCL_CALL(clReleaseKernel(k)); } - if (program_) { - OPENCL_CALL(clReleaseProgram(program_)); + if (programs_.size()) { + for (auto& kv : programs_) { + for (auto& program : kv.second) { + if (program) { + OPENCL_CALL(clReleaseProgram(program)); + } + } + } } } @@ -166,7 +172,6 @@ std::string OpenCLModuleNode::GetSource(const std::string& format) { void OpenCLModuleNode::Init() { workspace_ = GetGlobalWorkspace(); workspace_->Init(); - device_built_flag_.resize(workspace_->devices.size(), false); // initialize the kernel id, need to lock global table. std::lock_guard lock(workspace_->mu); for (const auto& kv : fmap_) { @@ -181,28 +186,53 @@ void OpenCLModuleNode::Init() { e.version = workspace_->timestamp++; kid_map_[key] = e; } + + std::string source = GetSource("cl"); + if (source.size()) { + std::string del{"// Function: "}; + size_t end; + size_t begin = source.find(del); + ICHECK(begin != std::string::npos) << "The OpenCL module expects a kernel delimited " + << "source from code generation, but no kernel " + << "delimiter was found."; + while (true) { + begin += del.size(); + end = source.find('\n', begin); + std::string func_name = source.substr(begin, end-begin); + begin = ++end; + // std::string::substr returns either start of next kernel + // or std::string::npos, in the latter case substr returns + // all characters until the end of the source string. + end = source.find(del, begin); + std::string func_source = source.substr(begin, (end == std::string::npos) ? 
end : end-begin); + parsed_kernels_.insert({func_name, func_source}); + begin = end; + if (end == std::string::npos) { break; } + } + } + for (auto& kv : parsed_kernels_) { + programs_.insert({kv.first, std::vector(workspace_->devices.size(), nullptr)}); + } } cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t, const std::string& func_name, const KTRefEntry& e) { std::lock_guard lock(build_lock_); int device_id = t->device.device_id; - if (!device_built_flag_[device_id]) { + if (programs_[func_name][device_id] == nullptr) { // create program if (fmt_ == "cl") { - if (program_ == nullptr) { - const char* s = data_.c_str(); - size_t len = data_.length(); - cl_int err; - program_ = clCreateProgramWithSource(w->context, 1, &s, &len, &err); - OPENCL_CHECK_ERROR(err); - } + const char* s = parsed_kernels_[func_name].c_str(); + size_t len = parsed_kernels_[func_name].length(); + cl_int err; + programs_[func_name][device_id] = clCreateProgramWithSource(w->context, 1, &s, &len, &err); + OPENCL_CHECK_ERROR(err); } else if (fmt_ == "xclbin" || fmt_ == "awsxclbin" || fmt_ == "aocx") { const unsigned char* s = (const unsigned char*)data_.c_str(); size_t len = data_.length(); cl_int err; cl_device_id dev = w->devices[device_id]; - program_ = clCreateProgramWithBinary(w->context, 1, &dev, &len, &s, NULL, &err); + programs_[func_name][device_id] = clCreateProgramWithBinary(w->context, 1, &dev, &len, &s, NULL, &err); OPENCL_CHECK_ERROR(err); } else { LOG(FATAL) << "Unknown OpenCL format " << fmt_; @@ -210,20 +240,19 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre // build program cl_int err; cl_device_id dev = w->devices[device_id]; - err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr); + err = clBuildProgram(programs_[func_name][device_id], 1, &dev, nullptr, nullptr, nullptr); if (err != CL_SUCCESS) { size_t len; std::string log; - clGetProgramBuildInfo(program_, dev, CL_PROGRAM_BUILD_LOG, 
0, nullptr, &len); + clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len); log.resize(len); - clGetProgramBuildInfo(program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr); - LOG(FATAL) << "OpenCL build error for device=" << dev << log; + clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr); + LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log; } - device_built_flag_[device_id] = true; } // build kernel cl_int err; - cl_kernel kernel = clCreateKernel(program_, func_name.c_str(), &err); + cl_kernel kernel = clCreateKernel(programs_[func_name][device_id], func_name.c_str(), &err); OPENCL_CHECK_ERROR(err); t->kernel_table[e.kernel_id].kernel = kernel; t->kernel_table[e.kernel_id].version = e.version; diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index f72f3f265511..edb614d9c122 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -283,23 +283,27 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) { // N runtime::Module BuildOpenCL(IRModule mod, Target target) { using tvm::runtime::Registry; bool output_ssa = false; - CodeGenOpenCL cg; - cg.Init(output_ssa); + std::stringstream code; + const auto* fpostproc = Registry::Get("tvm_callback_opencl_postproc"); for (auto kv : mod->functions) { ICHECK(kv.second->IsInstance()) << "CodeGenOpenCL: Can only take PrimFunc"; + code << "// Function: " << kv.first->name_hint << std::endl; + CodeGenOpenCL cg; + cg.Init(output_ssa); auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodeGenOpenCL: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; cg.AddFunction(f); + std::string fsource = cg.Finish(); + if (fpostproc) { + fsource = (*fpostproc)(fsource).operator std::string(); + } + code << fsource; } - 
std::string code = cg.Finish(); - if (const auto* f = Registry::Get("tvm_callback_opencl_postproc")) { - code = (*f)(code).operator std::string(); - } - return OpenCLModuleCreate(code, "cl", ExtractFuncInfo(mod), code); + return OpenCLModuleCreate(code.str(), "cl", ExtractFuncInfo(mod), code.str()); } TVM_REGISTER_GLOBAL("target.build.opencl").set_body_typed(BuildOpenCL); From 6f9c3435c1d91a89b5828d3bda5a1ed4a934b6aa Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 12 Apr 2021 21:57:41 -0700 Subject: [PATCH 2/6] clang-format --- src/runtime/opencl/opencl_module.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index a61302baa482..e4fd33cc70ee 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -198,16 +198,19 @@ void OpenCLModuleNode::Init() { while (true) { begin += del.size(); end = source.find('\n', begin); - std::string func_name = source.substr(begin, end-begin); + std::string func_name = source.substr(begin, end - begin); begin = ++end; // std::string::substr returns either start of next kernel // or std::string::npos, in the latter case substr returns // all characters until the end of the source string. end = source.find(del, begin); - std::string func_source = source.substr(begin, (end == std::string::npos) ? end : end-begin); + std::string func_source = + source.substr(begin, (end == std::string::npos) ? 
end : end - begin); parsed_kernels_.insert({func_name, func_source}); begin = end; - if (end == std::string::npos) { break; } + if (end == std::string::npos) { + break; + } } } for (auto& kv : parsed_kernels_) { @@ -232,7 +235,8 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre size_t len = data_.length(); cl_int err; cl_device_id dev = w->devices[device_id]; - programs_[func_name][device_id] = clCreateProgramWithBinary(w->context, 1, &dev, &len, &s, NULL, &err); + programs_[func_name][device_id] = + clCreateProgramWithBinary(w->context, 1, &dev, &len, &s, NULL, &err); OPENCL_CHECK_ERROR(err); } else { LOG(FATAL) << "Unknown OpenCL format " << fmt_; @@ -244,9 +248,11 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre if (err != CL_SUCCESS) { size_t len; std::string log; - clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len); + clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, + &len); log.resize(len); - clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr); + clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, + &log[0], nullptr); LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log; } } From 29c3163088b5181351dfcb353782359384cac701 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 12 Apr 2021 22:29:06 -0700 Subject: [PATCH 3/6] Remove check on program size when deconstructing. 
--- src/runtime/opencl/opencl_module.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index e4fd33cc70ee..2bcb172cb055 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -105,12 +105,11 @@ OpenCLModuleNode::~OpenCLModuleNode() { for (cl_kernel k : kernels_) { OPENCL_CALL(clReleaseKernel(k)); } - if (programs_.size()) { - for (auto& kv : programs_) { - for (auto& program : kv.second) { - if (program) { - OPENCL_CALL(clReleaseProgram(program)); - } + // free the programs + for (auto& kv : programs_) { + for (auto& program : kv.second) { + if (program) { + OPENCL_CALL(clReleaseProgram(program)); } } } @@ -187,6 +186,8 @@ void OpenCLModuleNode::Init() { kid_map_[key] = e; } + // Use function delimiters to parse the serialized source + // into separate source files for each kernel primitive std::string source = GetSource("cl"); if (source.size()) { std::string del{"// Function: "}; From 1e413fb6af796e007e37048740954aae3bda2c59 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 13 Apr 2021 13:48:25 -0700 Subject: [PATCH 4/6] Refactor into SplitKernels method. --- src/runtime/opencl/opencl_common.h | 7 ++++ src/runtime/opencl/opencl_module.cc | 62 ++++++++++++++++------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 363e0d3b5c0e..991317485115 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -315,6 +315,13 @@ class OpenCLModuleNode : public ModuleNode { cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t, const std::string& func_name, const KTRefEntry& e); + /*! + * \brief Splits the provided serialized source file into separate + * source for each kernel primitive. 
+ * \param source The serialized program source file (fmt: cl) + */ + std::unordered_map SplitKernels(std::string source) const; + private: // The workspace, need to keep reference to use it in destructor. // In case of static destruction order problem. diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 2bcb172cb055..f0a3e076cccf 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -186,34 +186,9 @@ void OpenCLModuleNode::Init() { kid_map_[key] = e; } - // Use function delimiters to parse the serialized source - // into separate source files for each kernel primitive - std::string source = GetSource("cl"); - if (source.size()) { - std::string del{"// Function: "}; - size_t end; - size_t begin = source.find(del); - ICHECK(begin != std::string::npos) << "The OpenCL module expects a kernel delimited " - << "source from code generation, but no kernel " - << "delimiter was found."; - while (true) { - begin += del.size(); - end = source.find('\n', begin); - std::string func_name = source.substr(begin, end - begin); - begin = ++end; - // std::string::substr returns either start of next kernel - // or std::string::npos, in the latter case substr returns - // all characters until the end of the source string. - end = source.find(del, begin); - std::string func_source = - source.substr(begin, (end == std::string::npos) ? 
end : end - begin); - parsed_kernels_.insert({func_name, func_source}); - begin = end; - if (end == std::string::npos) { - break; - } - } - } + // split into source artifacts for each kernel + parsed_kernels_ = SplitKernels(GetSource("cl")); + // zero initialize cl_program pointers for each device kernel for (auto& kv : parsed_kernels_) { programs_.insert({kv.first, std::vector(workspace_->devices.size(), nullptr)}); } @@ -267,6 +242,37 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre return kernel; } +std::unordered_map OpenCLModuleNode::SplitKernels( + std::string source) const { + std::unordered_map split_kernels; + if (source.size()) { + std::string del{"// Function: "}; + size_t end; + size_t begin = source.find(del); + ICHECK(begin != std::string::npos) << "The OpenCL module expects a kernel delimited " + << "source from code generation, but no kernel " + << "delimiter was found."; + while (true) { + begin += del.size(); + end = source.find('\n', begin); + std::string func_name = source.substr(begin, end - begin); + begin = ++end; + // std::string::find returns either start of next kernel + // or std::string::npos, in the latter case substr returns + // all characters until the end of the source string. + end = source.find(del, begin); + std::string func_source = + source.substr(begin, (end == std::string::npos) ? 
end : end - begin); + split_kernels.insert({func_name, func_source}); + begin = end; + if (end == std::string::npos) { + break; + } + } + } + return split_kernels; +} + Module OpenCLModuleCreate(std::string data, std::string fmt, std::unordered_map fmap, std::string source) { auto n = make_object(data, fmt, fmap, source); From 5adf4315d199f7185e6ed618b93d36edf6bc8c17 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 15 Apr 2021 15:17:15 -0700 Subject: [PATCH 5/6] Limit number of loops for kernel parsing --- src/runtime/opencl/opencl_module.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index f0a3e076cccf..6543b1de460c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -252,7 +252,7 @@ std::unordered_map OpenCLModuleNode::SplitKernels( ICHECK(begin != std::string::npos) << "The OpenCL module expects a kernel delimited " << "source from code generation, but no kernel " << "delimiter was found."; - while (true) { + for (size_t num_kernels = 0; num_kernels < workspace_->num_registered_kernels; num_kernels++) { begin += del.size(); end = source.find('\n', begin); std::string func_name = source.substr(begin, end - begin); @@ -270,6 +270,8 @@ std::unordered_map OpenCLModuleNode::SplitKernels( } } } + ICHECK_EQ(workspace_->num_registered_kernels, split_kernels.size()) + << "The number of registered kernels does not match number of parsed kernel sources"; return split_kernels; } From 1295763226e29bb9ab198de3aeed403b8961e7cf Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 30 Apr 2021 20:26:57 -0700 Subject: [PATCH 6/6] Add return doc for SplitKernels per CR. 
--- src/runtime/opencl/opencl_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 991317485115..d74a529595a2 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -319,6 +319,7 @@ class OpenCLModuleNode : public ModuleNode { * \brief Splits the provided serialized source file into separate * source for each kernel primitive. * \param source The serialized program source file (fmt: cl) + * \return Mapping from primitive name to kernel source */ std::unordered_map SplitKernels(std::string source) const;