src/codegen/codegen_cuda.cc (15 additions, 5 deletions)
@@ -34,6 +34,10 @@ std::string CodeGenCUDA::Finish() {
     decl_stream << "#include <cuda_fp16.h>\n";
   }
 
+  if (enable_int8_) {
+    decl_stream << "#include <sm_61_intrinsics.h>\n";
+  }
+
   return CodeGenC::Finish();
 }

@@ -81,13 +85,19 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
         os << "unsigned ";
       }
     }
-    if (t.bits() == 8 && t.lanes() == 4) {
-      // directly 4 8 bit int in integer.
-      os << "int"; return;
-    }
     switch (t.bits()) {
       case 8: {
-        if (!t.is_uint() && t.lanes() == 1) {
+        if (t.lanes() == 4) {
+          // directly 4 8 bit int in integer.
+          enable_int8_ = true;
+          os << "char4"; return;
+        } else if (t.lanes() == 8) {
+          enable_int8_ = true;
+          os << "int2"; return;
+        } else if (t.lanes() == 16) {
+          enable_int8_ = true;
+          os << "int4"; return;
+        } else if (!t.is_uint() && t.lanes() == 1) {
          os << "signed char"; break;
        } else {
          os << "char"; break;
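Note on the type mapping above: an int8x4 value is printed as CUDA's native char4, while int8x8 and int8x16 are carried as int2 and int4 (8 and 16 bytes), so vectorized int8 loads and stores map onto single aligned vector accesses; each of these paths also sets enable_int8_ so that Finish() emits the sm_61_intrinsics.h include. The kernel below is only a hand-written sketch of what code using the char4 carrier can look like; the kernel and buffer names are illustrative, not TVM's generated output.

// Illustrative sketch only (not TVM output): element-wise add-one over an
// int8x4 buffer carried as char4, one 4-byte vector load and store per thread.
extern "C" __global__ void add_one_int8x4(const char4* A, char4* B, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    char4 v = A[i];
    v.x += 1; v.y += 1; v.z += 1; v.w += 1;
    B[i] = v;
  }
}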
src/codegen/codegen_cuda.h (3 additions, 1 deletion)
@@ -20,7 +20,7 @@ class CodeGenCUDA final : public CodeGenC {
   void Init(bool output_ssa);
   void AddFunction(LoweredFunc f);
   std::string Finish();
-  bool need_include_path() { return enable_fp16_; }
+  bool need_include_path() { return (enable_fp16_ || enable_int8_); }
   // override behavior
   void VisitStmt_(const ir::For* op) final;
   void PrintStorageSync(const Call* op) final;
@@ -49,6 +49,8 @@ class CodeGenCUDA final : public CodeGenC {
   std::string vid_global_barrier_expect_;
   // whether enable fp16
   bool enable_fp16_{false};
+  // whether enable int8
+  bool enable_int8_{false};
 };
 
 }  // namespace codegen
src/codegen/opt/build_cuda_on.cc (0 additions, 3 deletions)
@@ -64,7 +64,6 @@ std::string FindCUDAIncludePath() {
 std::string NVRTCCompile(const std::string& code, bool include_path = false) {
   std::vector<std::string> compile_params;
   std::vector<const char*> param_cstrings{};
-  int num_options = 0;
   nvrtcProgram prog;
   cudaDeviceProp device_prop;
   std::string cc = "30";
@@ -78,13 +77,11 @@ std::string NVRTCCompile(const std::string& code, bool include_path = false) {
   }
 
   compile_params.push_back("-arch=compute_" + cc);
-  num_options++;
 
   if (include_path) {
     std::string include_option = "--include-path=" + FindCUDAIncludePath();
 
     compile_params.push_back(include_option);
-    num_options++;
   }
 
   for (const auto& string : compile_params) {
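The deleted num_options counter was redundant: the option strings already live in compile_params, so the count NVRTC needs can come straight from the container at the call site. A minimal sketch of that pattern, assuming a hypothetical helper around nvrtcCompileProgram (this is not the full NVRTCCompile body):

#include <nvrtc.h>
#include <string>
#include <vector>

// Sketch only: take the option count from the vector instead of maintaining
// a separate counter by hand.
nvrtcResult CompileWithOptions(nvrtcProgram prog,
                               const std::vector<std::string>& compile_params) {
  std::vector<const char*> param_cstrings;
  for (const auto& option : compile_params) {
    param_cstrings.push_back(option.c_str());
  }
  return nvrtcCompileProgram(prog, static_cast<int>(param_cstrings.size()),
                             param_cstrings.data());
}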
tests/python/unittest/test_codegen_cuda.py (63 additions, 1 deletion)
@@ -1,6 +1,7 @@
 import tvm
 import numpy as np
-from tvm.contrib.nvcc import have_fp16
+from tvm.contrib.nvcc import have_fp16, have_int8
+from tvm.contrib import nvcc
 
 def test_cuda_vectorize_add():
     num_thread = 8
@@ -11,6 +12,9 @@ def check_cuda(dtype, n, lanes):
         if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
             print("skip because gpu does not support fp16")
             return
+        if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
+            print("skip because gpu does not support int8")
+            return
         A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
         B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B')
         s = tvm.create_schedule(B.op)
@@ -27,6 +31,64 @@ def check_cuda(dtype, n, lanes):

     check_cuda("float32", 64, 2)
     check_cuda("float16", 64, 2)
+    check_cuda("int8", 64, 4)
+
+
+def test_cuda_multiply_add():
+    num_thread = 8
+    def check_cuda(dtype, n, lanes):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
+            print("skip because gpu does not support int8")
+            return
+        A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
+        B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
+        C = tvm.placeholder((n,), name='C', dtype="int32")
+        D = tvm.compute((n,),
+                        lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
+        s = tvm.create_schedule(D.op)
+        xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
+        s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
+        s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
+        fun = tvm.build(s, [A, B, C, D], "cuda")
+        np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
+        np_b = np.random.randint(low=-128, high=127, size=(n,lanes))
+        np_c = np.random.randint(low=0, high=127, size=(n,))
+        np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
+        ctx = tvm.gpu(0)
+        a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
+        b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
+        c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
+        d = tvm.nd.empty((n,), D.dtype, ctx)
+        fun(a, b, c, d)
+        np.testing.assert_allclose(d.asnumpy(), np_d)
+    check_cuda("int8", 64, 4)
+
+def test_cuda_vectorize_load():
+    num_thread = 8
+    def check_cuda(dtype, n, lanes):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        ctx = tvm.gpu(0)
+        A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
+        B = tvm.compute((n,), lambda i: A[i], name='B')
+        s = tvm.create_schedule(B.op)
+        bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
+        s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
+        s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
+        fun = tvm.build(s, [A, B], "cuda", name="vector_load")
+        np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
+        a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
+        b = tvm.nd.empty((n,), B.dtype, ctx)
+        fun(a,b)
+        np.testing.assert_allclose(a.asnumpy(), b.asnumpy())
+    check_cuda("int8", 64, 8)
+    check_cuda("int8", 64, 16)
 
 if __name__ == "__main__":
     test_cuda_vectorize_add()
+    test_cuda_multiply_add()
+    test_cuda_vectorize_load()
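For reference, __dp4a (declared in sm_61_intrinsics.h and available on compute capability 6.1 and newer) treats each 32-bit operand as four signed 8-bit lanes, computes their dot product, and adds the 32-bit accumulator, which is what test_cuda_multiply_add checks against its NumPy reference; test_cuda_vectorize_load similarly exercises the int2/int4 carriers so each thread moves 8 or 16 int8 values with a single vector load and store. The kernel below is a hand-written sketch of the __dp4a semantics with hypothetical names, not the code TVM emits.

// Sketch only: one __dp4a per element, i.e. D[i] = dot(A8[i], B8[i]) + C[i],
// where A and B pack four signed 8-bit values per 32-bit word.
extern "C" __global__ void dp4a_sketch(const int* A, const int* B,
                                       const int* C, int* D, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    D[i] = __dp4a(A[i], B[i], C[i]);
  }
}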