From 469741489bd9693fd7cf6ac2f5074ed31ffa9201 Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Tue, 13 Oct 2020 14:50:17 -0700
Subject: [PATCH 1/5] Guard RTC better

---
 src/common/cuda/rtc.cc | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc
index 8f3b3391f5e4..4105a28f98b4 100644
--- a/src/common/cuda/rtc.cc
+++ b/src/common/cuda/rtc.cc
@@ -87,6 +87,31 @@ std::string GetPtx(nvrtcProgram program) {
   return ptx;
 }
 
+std::string GetArchString(const int sm_arch) {
+#if CUDA_VERSION < 10000
+  constexpr int max_supported_sm_arch = 70;
+#elif CUDA_VERSION < 11000
+  constexpr int max_supported_sm_arch = 75;
+#elif CUDA_VERSION < 11010
+  constexpr int max_supported_sm_arch = 80;
+#else
+  constexpr int max_supported_sm_arch = 86;
+#endif
+
+#if CUDA_VERSION <= 11000
+  // Always use PTX for CUDA <= 11.0
+  const bool known_arch = false;
+#else
+  const bool known_arch = sm_arch <= max_supported_sm_arch;
+#endif
+  const int actual_sm_arch = std::min(sm_arch, max_supported_sm_arch);
+  if (known_arch) {
+    return "sm_" + std::to_string(actual_sm_arch);
+  } else {
+    return "compute_" + std::to_string(actual_sm_arch);
+  }
+}
+
 }  // namespace
 
 CUfunction get_function(const std::string &parameters,
@@ -141,8 +166,7 @@ CUfunction get_function(const std::string &parameters,
                                   0,                                         // num headers
                                   nullptr,                                   // headers
                                   nullptr));                                 // include names
-
-    std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch);
+    std::string gpu_arch_arg = "--gpu-architecture=" + GetArchString(sm_arch);
     const char *opts[] = {gpu_arch_arg.c_str(),
 #if NDEBUG == 0
                           "-G",

From 8ff0b9e0a2b8c0e2be219ec8ac764da7245ab38d Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Fri, 16 Oct 2020 11:04:36 -0700
Subject: [PATCH 2/5] Use nvrtcGetCUBIN

---
 src/common/cuda/rtc.cc | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc
index 4105a28f98b4..6acf2c52b75a 100644
--- a/src/common/cuda/rtc.cc
+++ b/src/common/cuda/rtc.cc
@@ -78,16 +78,23 @@ std::string GetCompileLog(nvrtcProgram program) {
 }
 
 // Obtain compilation result (ptx assembly) from the program.
-std::string GetPtx(nvrtcProgram program) {
+std::string GetCompiledCode(nvrtcProgram program, bool use_cubin) {
+#if CUDA_VERSION >= 11010
+  const auto getSize = use_cubin ? nvrtcGetCUBINSize : nvrtcGetPTXSize;
+  const auto getFunc = use_cubin ? nvrtcGetCUBIN : nvrtcGetPTX;
+#else
+  const auto getSize = nvrtcGetPTXSize;
+  const auto getFunc = nvrtcGetPTX;
+#endif
   size_t ptx_size_including_null;
-  NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size_including_null));
+  NVRTC_CALL(getSize(program, &ptx_size_including_null));
   std::string ptx(ptx_size_including_null - 1, '\0');
   // Room for terminating null character ensured since C++11
-  NVRTC_CALL(nvrtcGetPTX(program, &ptx[0]));
+  NVRTC_CALL(getFunc(program, &ptx[0]));
   return ptx;
 }
 
-std::string GetArchString(const int sm_arch) {
+std::tuple<bool, std::string> GetArchString(const int sm_arch) {
 #if CUDA_VERSION < 10000
   constexpr int max_supported_sm_arch = 70;
 #elif CUDA_VERSION < 11000
@@ -106,9 +113,9 @@ std::string GetArchString(const int sm_arch) {
 #endif
   const int actual_sm_arch = std::min(sm_arch, max_supported_sm_arch);
   if (known_arch) {
-    return "sm_" + std::to_string(actual_sm_arch);
+    return {known_arch, "sm_" + std::to_string(actual_sm_arch)};
   } else {
-    return "compute_" + std::to_string(actual_sm_arch);
+    return {known_arch, "compute_" + std::to_string(actual_sm_arch)};
   }
 }
 
@@ -166,13 +173,14 @@ CUfunction get_function(const std::string &parameters,
                                   0,                                         // num headers
                                   nullptr,                                   // headers
                                   nullptr));                                 // include names
-    std::string gpu_arch_arg = "--gpu-architecture=" + GetArchString(sm_arch);
+    const auto [use_cubin, gpu_arch] = GetArchString(sm_arch);
+    std::string gpu_arch_arg = "--gpu-architecture=" + gpu_arch;
     const char *opts[] = {gpu_arch_arg.c_str(),
 #if NDEBUG == 0
                           "-G",
 #endif
                           "--std=c++14"};
-    const std::string kernel_name_demangled = kernel_name;
+    const std::string& kernel_name_demangled = kernel_name;
     NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str()));
 
     nvrtcResult compileResult = nvrtcCompileProgram(program,                         // prog
@@ -189,7 +197,7 @@ CUfunction get_function(const std::string &parameters,
         << "The generated code was stored in " << dump_file << "\n"
         << GetCompileLog(program);
 
-    kinfo.ptx = GetPtx(program);
+    kinfo.ptx = GetCompiledCode(program, use_cubin);
     const char *mangled_name;
     NVRTC_CALL(nvrtcGetLoweredName(program,
                                    kernel_name_demangled.c_str(),

From 869aaa5249525290ac80e76b3c7e23580168a9bd Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Fri, 16 Oct 2020 15:05:13 -0700
Subject: [PATCH 3/5] Fix lint

---
 src/common/cuda/rtc.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc
index 6acf2c52b75a..b2b5b4d790ce 100644
--- a/src/common/cuda/rtc.cc
+++ b/src/common/cuda/rtc.cc
@@ -28,6 +28,8 @@
 #include <fstream>
 #include <unordered_map>
 #include <vector>
+#include <tuple>
+#include <algorithm>
 
 #include "rtc.h"
 #include "rtc/half-inl.h"
@@ -173,7 +175,7 @@ CUfunction get_function(const std::string &parameters,
                                   0,                                         // num headers
                                   nullptr,                                   // headers
                                   nullptr));                                 // include names
-    const auto [use_cubin, gpu_arch] = GetArchString(sm_arch);
+    const auto [use_cubin, gpu_arch] = GetArchString(sm_arch);  // NOLINT(*)
     std::string gpu_arch_arg = "--gpu-architecture=" + gpu_arch;
     const char *opts[] = {gpu_arch_arg.c_str(),
 #if NDEBUG == 0

From e882040a58cc04207efe74996502ce07862eb5f0 Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Mon, 19 Oct 2020 09:50:52 -0700
Subject: [PATCH 4/5] Enable cubin loading in legacy rtc path

---
 include/mxnet/rtc.h |  2 +-
 src/common/rtc.cc   | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/mxnet/rtc.h b/include/mxnet/rtc.h
index 747c0b5c94ab..56717f4a34c7 100644
--- a/include/mxnet/rtc.h
+++ b/include/mxnet/rtc.h
@@ -60,7 +60,7 @@ class CudaModule {
     /*! \brief nvrtc program handle. */
     nvrtcProgram prog_;
     /*! \brief compiled cuda PTX */
-    char* ptx_;
+    std::vector<char> ptx_;
     /*! \brief lazily loaded cuda module */
     std::unordered_map<int, CUmodule> mod_;
     /*! \brief exported names */
diff --git a/src/common/rtc.cc b/src/common/rtc.cc
index 21d3061e5209..8449e9298e25 100644
--- a/src/common/rtc.cc
+++ b/src/common/rtc.cc
@@ -44,8 +44,8 @@ CudaModule::Chunk::Chunk(
       << "For lower version of CUDA, please prepend your kernel defintiions "
       << "with extern \"C\" instead.";
 #endif
-  std::vector<const char*> c_options;
-  for (const auto& i : options) c_options.push_back(i.c_str());
+  std::vector<const char*> c_options(options.size());
+  for (const auto& i : options) c_options.emplace_back(i.c_str());
   nvrtcResult compile_res = nvrtcCompileProgram(prog_, c_options.size(), c_options.data());
   if (compile_res != NVRTC_SUCCESS) {
     size_t err_size;
@@ -55,10 +55,30 @@ CudaModule::Chunk::Chunk(
     LOG(FATAL) << err.data();
   }
 
-  size_t ptx_size;
-  NVRTC_CALL(nvrtcGetPTXSize(prog_, &ptx_size));
-  ptx_ = new char[ptx_size];
-  NVRTC_CALL(nvrtcGetPTX(prog_, ptx_));
+  bool use_ptx = true;
+  for (const auto& opt : options) {
+    if (opt.find("sm_") != std::string::npos) {
+      use_ptx = false;
+      break;
+    }
+  }
+
+  if (use_ptx) {
+    size_t ptx_size;
+    NVRTC_CALL(nvrtcGetPTXSize(prog_, &ptx_size));
+    ptx_.resize(ptx_size);
+    NVRTC_CALL(nvrtcGetPTX(prog_, ptx_.data()));
+  } else {
+#if CUDA_VERSION >= 11010
+    size_t cubin_size;
+    NVRTC_CALL(nvrtcGetCUBINSize(prog_, &cubin_size));
+    ptx_.resize(ptx_size);
+    NVRTC_CALL(nvrtcGetCUBIN(prog_, ptx_.data()));
+#else
+    LOG(FATAL) << "Your CUDA version does not support compiling for sm_XX target. "
+               << "Use compute_XX target instead or upgrade to CUDA 11.1 or later.";
+#endif
+  }
 }
 
 
@@ -67,7 +87,6 @@ CudaModule::Chunk::~Chunk() {
     CUDA_DRIVER_CALL(cuModuleUnload(kv.second));
   }
   NVRTC_CALL(nvrtcDestroyProgram(&prog_));
-  delete ptx_;
 }
 
 
@@ -83,7 +102,7 @@ CUfunction CudaModule::Chunk::GetFunction(
     module = iter->second;
   } else {
     device_store.SetDevice(ctx.dev_id);
-    CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, ptx_, 0, 0, 0));
+    CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, ptx_.data(), 0, nullptr, nullptr));
     mod_[ctx.dev_id] = module;
   }
   CUfunction function;
@@ -176,7 +195,7 @@ void CudaModule::Kernel::Launch(
         function, grid_dim_x, grid_dim_y, grid_dim_z,
         block_dim_x, block_dim_y, block_dim_z,
         shared_mem, s->stream_,
-        p_args.data(), 0));
+        p_args.data(), nullptr));
     CUDA_CALL(cudaStreamSynchronize(s->stream_));
   }, ctx, read_vars, write_vars, FnProperty::kNormal, 0,
   mangled_name_.c_str());

From 66df3b7749c63e56def9e7f572174fb255c26780 Mon Sep 17 00:00:00 2001
From: Przemek Tredak <ptredak@nvidia.com>
Date: Mon, 2 Nov 2020 12:31:41 -0800
Subject: [PATCH 5/5] Fixes from review

---
 src/common/cuda/rtc.cc | 2 +-
 src/common/rtc.cc      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc
index b2b5b4d790ce..2284beec11cd 100644
--- a/src/common/cuda/rtc.cc
+++ b/src/common/cuda/rtc.cc
@@ -98,7 +98,7 @@ std::string GetCompiledCode(nvrtcProgram program, bool use_cubin) {
 
 std::tuple<bool, std::string> GetArchString(const int sm_arch) {
 #if CUDA_VERSION < 10000
-  constexpr int max_supported_sm_arch = 70;
+  constexpr int max_supported_sm_arch = 72;
 #elif CUDA_VERSION < 11000
   constexpr int max_supported_sm_arch = 75;
 #elif CUDA_VERSION < 11010
diff --git a/src/common/rtc.cc b/src/common/rtc.cc
index 8449e9298e25..a2662ce9f59c 100644
--- a/src/common/rtc.cc
+++ b/src/common/rtc.cc
@@ -72,7 +72,7 @@ CudaModule::Chunk::Chunk(
 #if CUDA_VERSION >= 11010
     size_t cubin_size;
     NVRTC_CALL(nvrtcGetCUBINSize(prog_, &cubin_size));
-    ptx_.resize(ptx_size);
+    ptx_.resize(cubin_size);
     NVRTC_CALL(nvrtcGetCUBIN(prog_, ptx_.data()));
 #else
     LOG(FATAL) << "Your CUDA version does not support compiling for sm_XX target. "