From abef4957038138e617721bbe44458a4d508554e7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 15 Oct 2017 13:37:09 -0700
Subject: [PATCH 1/3] [CODEGEN] Allow linking additional modules

---
 python/tvm/__init__.py               |  4 +-
 python/tvm/contrib/nvcc.py           | 60 ++++++++++++++++++++++++++++
 python/tvm/contrib/rocm.py           |  1 +
 src/codegen/llvm/codegen_llvm.cc     | 11 +++++
 src/codegen/llvm/codegen_llvm.h      |  8 +++-
 src/codegen/llvm/codegen_nvptx.cc    | 22 +++++++++-
 src/codegen/llvm/intrin_rule_llvm.cc | 42 +------------------
 src/codegen/llvm/intrin_rule_llvm.h  | 56 ++++++++++++++++++++++++++
 src/codegen/llvm/llvm_common.h       |  2 +
 9 files changed, 162 insertions(+), 44 deletions(-)
 create mode 100644 src/codegen/llvm/intrin_rule_llvm.h

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 90ac45988bcb..e23eed7168dc 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -30,4 +30,6 @@
 from .schedule import create_schedule
 from .build_module import build, lower, build_config
 from .tag import tag_scope
-from .contrib import rocm as _rocm
+
+# Contrib initializers
+from .contrib import rocm as _rocm, nvcc as _nvcc
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 9651466b723d..3242619c30f8 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -3,8 +3,12 @@
 from __future__ import absolute_import as _abs
 
 import subprocess
+import os
+import warnings
 from . import util
 from .. import ndarray as nd
+from ..api import register_func
+
 
 def compile_cuda(code,
                  target="ptx",
@@ -72,3 +76,59 @@ def compile_cuda(code,
         raise RuntimeError(msg)
 
     return bytearray(open(file_target, "rb").read())
+
+
+def find_cuda_path():
+    """Utility function to find cuda path
+
+    Returns
+    -------
+    path : str
+        Path to cuda root.
+    """
+    if "CUDA_PATH" in os.environ:
+        return os.environ["CUDA_PATH"]
+    cmd = ["which", "nvcc"]
+    proc = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    (out, _) = proc.communicate()
+    if proc.returncode == 0:
+        return os.path.abspath(os.path.join(out.strip(), "../.."))
+    cuda_path = "/usr/local/cuda"
+    if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
+        return cuda_path
+    raise RuntimeError("Cannot find cuda path")
+
+
+def find_libdevice_path(arch):
+    """Utility function to find libdevice
+
+    Parameters
+    ----------
+    arch : int
+        The compute architecture as an integer (e.g. 52 for sm_52).
+    """
+    cuda_path = find_cuda_path()
+    lib_path = os.path.join(cuda_path, "nvvm/libdevice")
+    selected_ver = 0
+    selected_path = None
+
+    for fn in os.listdir(lib_path):
+        if not fn.startswith("libdevice"):
+            continue
+        ver = int(fn.split(".")[-3].split("_")[-1])
+        if ver > selected_ver and ver <= arch:
+            selected_ver = ver
+            selected_path = fn
+    if selected_path is None:
+        raise RuntimeError("Cannot find libdevice for arch {}".format(arch))
+    return os.path.join(lib_path, selected_path)
+
+
+@register_func("tvm_callback_libdevice_path")
+def callback_libdevice_path(arch):
+    try:
+        return find_libdevice_path(arch)
+    except RuntimeError:
+        warnings.warn("Cannot find libdevice path")
+        return ""
diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py
index c367aef24e21..ee956c85e0ed 100644
--- a/python/tvm/contrib/rocm.py
+++ b/python/tvm/contrib/rocm.py
@@ -26,6 +26,7 @@ def rocm_link(in_file, out_file):
         msg += str(out)
         raise RuntimeError(msg)
 
+
 @register_func("tvm_callback_rocm_link")
 def callback_rocm_link(obj_bin):
     """Links object file generated from LLVM to HSA Code Object
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index 299e2d8483b0..cb2eae40eaeb 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -147,10 +147,21 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) {
 
 std::unique_ptr<llvm::Module> CodeGenLLVM::Finish() {
   this->AddStartupFunction();
+  // link modules
+  for (size_t i = 0; i < link_modules_.size(); ++i) {
+    CHECK(!llvm::Linker::linkModules(*module_, std::move(link_modules_[i])))
+        << "Failed to link modules";
+  }
+  link_modules_.clear();
+  // optimize
   this->Optimize();
   return std::move(module_);
 }
 
+void CodeGenLLVM::AddLinkModule(std::unique_ptr<llvm::Module>&& mod) {
+  link_modules_.emplace_back(std::move(mod));
+}
+
 void CodeGenLLVM::AddMainFunction(const std::string& entry_func_name) {
   LOG(FATAL) << "not implemented";
 }
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index 631c42f7b226..e4a0b24d381a 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -66,6 +66,11 @@ class CodeGenLLVM :
    * \return the created module.
    */
   virtual std::unique_ptr<llvm::Module> Finish();
+  /*!
+   * \brief Add a module to be linked with the generated module.
+   * \param mod The module to be linked.
+   */
+  void AddLinkModule(std::unique_ptr<llvm::Module>&& mod);
   /*!
    * \brief Create Value for expression e
    * \param e The expression to be created value for.
@@ -227,7 +232,8 @@ class CodeGenLLVM :
   llvm::MDNode* md_very_likely_branch_{nullptr};
   llvm::MDNode* md_tbaa_root_{nullptr};
   llvm::MDNode* md_tbaa_alias_set_{nullptr};
-
+  // modules to be linked.
+  std::vector<std::unique_ptr<llvm::Module> > link_modules_;
   /*! \brief native vector bits of current targetx*/
   int native_vector_bits_{0};
   /*! \brief the storage scope of allocation */
diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc
index ede882895f95..d147709ff1a2 100644
--- a/src/codegen/llvm/codegen_nvptx.cc
+++ b/src/codegen/llvm/codegen_nvptx.cc
@@ -153,9 +153,10 @@ inline int DetectCUDAComputeVersion() {
 runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
   CHECK(target.length() >= 5 &&
         target.substr(0, 5) == "nvptx");
+  int compute_ver = DetectCUDAComputeVersion();
   std::ostringstream config;
   config << "-mtriple=nvptx64-nvidia-cuda -mcpu=sm_"
-         << DetectCUDAComputeVersion()
+         << compute_ver
          << target.substr(5, target.length() - 5);
   llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenNVPTX> cg(new CodeGenNVPTX());
@@ -164,6 +165,25 @@ runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
   for (LoweredFunc f :  funcs) {
     cg->AddFunction(f);
   }
+
+  const auto* flibdevice_path =
+      tvm::runtime::Registry::Get("tvm_callback_libdevice_path");
+  if (flibdevice_path != nullptr) {
+    std::string path = (*flibdevice_path)(compute_ver);
+    if (path.length() != 0) {
+      llvm::SMDiagnostic err;
+      std::unique_ptr<llvm::Module> mlib = llvm::parseIRFile(path, err, *ctx);
+      if (mlib.get() == nullptr) {
+        std::string msg = err.getMessage();
+        LOG(FATAL) << "Fail to load bitcode file " << path << "\n"
+                   << "line " << err.getLineNo() << ":" << msg;
+      }
+      mlib->setTargetTriple(tm->getTargetTriple().str());
+      mlib->setDataLayout(tm->createDataLayout());
+      // TODO(tqchen) libdevice linking not yet working.
+      // cg->AddLinkModule(std::move(mlib));
+    }
+  }
   std::unique_ptr<llvm::Module> module = cg->Finish();
   llvm::SmallString<8> data_ptx, data_ll;
   llvm::raw_svector_ostream dest_ptx(data_ptx), dest_ll(data_ll);
diff --git a/src/codegen/llvm/intrin_rule_llvm.cc b/src/codegen/llvm/intrin_rule_llvm.cc
index dd5ce9847b40..63bcca0985a0 100644
--- a/src/codegen/llvm/intrin_rule_llvm.cc
+++ b/src/codegen/llvm/intrin_rule_llvm.cc
@@ -4,52 +4,12 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include <tvm/ir.h>
-#include <tvm/api_registry.h>
-#include <tvm/codegen.h>
-#include <string>
-#include "./llvm_common.h"
+#include "./intrin_rule_llvm.h"
 
 namespace tvm {
 namespace codegen {
 namespace llvm {
 
-using namespace ir;
-
-// num_signature means number of arguments used to query signature
-template<unsigned id, int num_signature>
-inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) {
-  Expr e = targs[0];
-  const Call* call = e.as<Call>();
-  CHECK(call != nullptr);
-  Array<Expr> cargs;
-  // intrin id.
-  cargs.push_back(UIntImm::make(UInt(32), id));
-  cargs.push_back(UIntImm::make(UInt(32), num_signature));
-
-  for (Expr arg : call->args) {
-    cargs.push_back(arg);
-  }
-  *rv = Call::make(
-      call->type, "llvm_intrin", cargs, Call::PureIntrinsic);
-}
-
-template<unsigned id, int num_signature>
-inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) {
-  Expr e = targs[0];
-  const Call* call = e.as<Call>();
-  CHECK(call != nullptr);
-  Array<Expr> cargs;
-  // intrin id.
-  cargs.push_back(UIntImm::make(UInt(32), id));
-  cargs.push_back(UIntImm::make(UInt(32), num_signature));
-  for (Expr arg : call->args) {
-    cargs.push_back(arg);
-  }
-  *rv = Call::make(
-      call->type, "llvm_intrin", cargs, Call::Intrinsic);
-}
-
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.prefetch")
 .set_body(DispatchLLVMIntrin<::llvm::Intrinsic::prefetch, 0>);
 
diff --git a/src/codegen/llvm/intrin_rule_llvm.h b/src/codegen/llvm/intrin_rule_llvm.h
new file mode 100644
index 000000000000..85641cb178e7
--- /dev/null
+++ b/src/codegen/llvm/intrin_rule_llvm.h
@@ -0,0 +1,56 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file intrin_rule_llvm.h
+ * \brief Common utilities for llvm intrinsics.
+ */
+#ifndef TVM_CODEGEN_LLVM_INTRIN_RULE_LLVM_H_
+#define TVM_CODEGEN_LLVM_INTRIN_RULE_LLVM_H_
+#ifdef TVM_LLVM_VERSION
+
+#include <tvm/ir.h>
+#include <tvm/api_registry.h>
+#include <tvm/codegen.h>
+#include <string>
+#include "./llvm_common.h"
+
+namespace tvm {
+namespace codegen {
+// num_signature means number of arguments used to query signature
+template<unsigned id, int num_signature>
+inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) {
+  Expr e = targs[0];
+  const ir::Call* call = e.as<ir::Call>();
+  CHECK(call != nullptr);
+  Array<Expr> cargs;
+  // intrin id.
+  cargs.push_back(ir::UIntImm::make(UInt(32), id));
+  cargs.push_back(ir::UIntImm::make(UInt(32), num_signature));
+
+  for (Expr arg : call->args) {
+    cargs.push_back(arg);
+  }
+  *rv = ir::Call::make(
+      call->type, "llvm_intrin", cargs, ir::Call::PureIntrinsic);
+}
+
+template<unsigned id, int num_signature>
+inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) {
+  Expr e = targs[0];
+  const ir::Call* call = e.as<ir::Call>();
+  CHECK(call != nullptr);
+  Array<Expr> cargs;
+  // intrin id.
+  cargs.push_back(ir::UIntImm::make(UInt(32), id));
+  cargs.push_back(ir::UIntImm::make(UInt(32), num_signature));
+  for (Expr arg : call->args) {
+    cargs.push_back(arg);
+  }
+  *rv = ir::Call::make(
+      call->type, "llvm_intrin", cargs, ir::Call::Intrinsic);
+}
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_LLVM_VERSION
+#endif  // TVM_CODEGEN_LLVM_INTRIN_RULE_LLVM_H_
diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h
index da905f4709e0..11ff66d8ca38 100644
--- a/src/codegen/llvm/llvm_common.h
+++ b/src/codegen/llvm/llvm_common.h
@@ -43,6 +43,8 @@
 #include <llvm/IRReader/IRReader.h>
 #include <llvm/CodeGen/TargetLoweringObjectFileImpl.h>
 
+#include <llvm/Linker/Linker.h>
+
 #include <utility>
 #include <string>
 

From 962aa9a5bfa119cf1cb58d285ef3cb0c6a68fbe8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 15 Oct 2017 13:51:21 -0700
Subject: [PATCH 2/3] fix py3

---
 python/tvm/contrib/nvcc.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 3242619c30f8..1267b14fc46d 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -8,7 +8,7 @@
 from . import util
 from .. import ndarray as nd
 from ..api import register_func
-
+from .._ffi.base import py_str
 
 def compile_cuda(code,
                  target="ptx",
@@ -92,8 +92,9 @@ def find_cuda_path():
     proc = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     (out, _) = proc.communicate()
+    out = py_str(out)
     if proc.returncode == 0:
-        return os.path.abspath(os.path.join(out.strip(), "../.."))
+        return os.path.abspath(os.path.join(str(out).strip(), "../.."))
     cuda_path = "/usr/local/cuda"
     if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
         return cuda_path
@@ -125,7 +126,6 @@ def find_libdevice_path(arch):
     return os.path.join(lib_path, selected_path)
 
 
-@register_func("tvm_callback_libdevice_path")
 def callback_libdevice_path(arch):
     try:
         return find_libdevice_path(arch)

From 610b4ca7fd319deed9008dab0b8b825b10bb4306 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 15 Oct 2017 13:59:47 -0700
Subject: [PATCH 3/3] add register back

---
 python/tvm/contrib/nvcc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 1267b14fc46d..ac8dbf65b2bc 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -101,6 +101,7 @@ def find_cuda_path():
     raise RuntimeError("Cannot find cuda path")
 
 
+@register_func("tvm_callback_libdevice_path")
 def find_libdevice_path(arch):
     """Utility function to find libdevice