From a17e5c7b5104b766deb539fd357a0ac81a5ec0a0 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Wed, 12 Sep 2018 16:18:12 +0800
Subject: [PATCH 1/4] [TOPI] Add dp4a intrinsic to CUDA

---
 topi/python/topi/cuda/int8_intrinsics.py | 54 ++++++++++++++++++++++++
 topi/recipe/gemm/gemm_int8.py            | 38 ++---------------
 2 files changed, 57 insertions(+), 35 deletions(-)
 create mode 100644 topi/python/topi/cuda/int8_intrinsics.py

diff --git a/topi/python/topi/cuda/int8_intrinsics.py b/topi/python/topi/cuda/int8_intrinsics.py
new file mode 100644
index 000000000000..85e7ab3fa410
--- /dev/null
+++ b/topi/python/topi/cuda/int8_intrinsics.py
@@ -0,0 +1,54 @@
+"""Int8 intrinsics on CUDA."""
+#pylint: disable=invalid-name
+import tvm
+
+
+def _intrin_dp4a_reduce(x_scope, y_scope, z_scope):
+    """
+    Int8 dot product reduced by every 4 elements using __dp4a
+
+    Parameters
+    ----------
+    x_scope: The storage scope of buffer for lhs
+    y_scope: The storage scope of buffer for rhs
+    z_scope: The storage scope of buffer for result
+    """
+
+    n = 4  # dp4a requires operands packed by 4
+    x = tvm.placeholder((n,), name='x', dtype='int8')
+    y = tvm.placeholder((n,), name='y', dtype='int8')
+
+    k = tvm.reduce_axis((0, n), name='rc')
+
+    z = tvm.compute((1,), lambda i: tvm.sum(
+        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            xx, yy = ins
+            zz = outs[0]
+
+            if index == 1:
+                return zz.vstore(0, 0)
+
+            ib = tvm.ir_builder.create()
+
+            vec_x = xx.vload(0, dtype='int8x4')
+            vec_y = yy.vload(0, dtype='int8x4')
+            prev_z = 0 if index == 0 else zz.vload(0)
+
+            dp4a = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
+            ib.emit(zz.vstore(0, dp4a))
+
+            return ib.get()
+
+        return _instr(0), _instr(1), _instr(2) # body, reset, update
+
+    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
+        scopes = {x: x_scope, y: y_scope, z: z_scope}
+        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
+                                    data_alignment=cfg.data_alignment,
+                                    offset_factor=cfg.offset_factor,
+                                    scope=scopes[t]) for t in [x, y, z]}
+
+        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
index 4cce2735c4a2..49353f511cf3 100644
--- a/topi/recipe/gemm/gemm_int8.py
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -4,44 +4,12 @@
 import numpy as np
 import tvm
 from tvm import autotvm
+from topi.cuda.int8_intrinsics import _intrin_dp4a_reduce
 
 DO_TUNING = True
 PRETUNED_INDEX = 75333
 
-def intrin_dot():
-    n = 4  # dp4a requires operands packed by 4
-    x = tvm.placeholder((n,), name='x', dtype='int8')
-    y = tvm.placeholder((n,), name='y', dtype='int8')
-    k = tvm.reduce_axis((0, n), name='k')
-
-    z = tvm.compute(
-        (1,), lambda _: tvm.sum(
-            x[k].astype('int32') * y[k].astype('int32'), axis=k))
-
-    def intrin_func(ins, outs):
-        xx, yy = ins
-        zz = outs[0]
-        ib = tvm.ir_builder.create()
-
-        dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a',
-                                                 xx.vload(0, dtype='int8x4'),
-                                                 yy.vload(0, dtype='int8x4'),
-                                                 zz.vload(0)))
-        ib.emit(dp4a)
-
-        body = ib.get()
-        return body, zz.vstore(0, 0), body
-
-    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
-        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
-                                    data_alignment=cfg.data_alignment,
-                                    offset_factor=cfg.offset_factor,
-                                    scope='local') for t in [x, y, z]}
-        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
-
-
-dot = intrin_dot()
-
+intrin_dp4a_reduce = _intrin_dp4a_reduce('local', 'local', 'local')
 
 @autotvm.template
 def gemm_int8(n, m, l):
@@ -70,7 +38,7 @@ def gemm_int8(n, m, l):
 
     ko, kt, ki = cfg['tile_k'].apply(s, CC, k)
 
-    s[CC].tensorize(ki, dot)
+    s[CC].tensorize(ki, intrin_dp4a_reduce)
 
     block_x = tvm.thread_axis('blockIdx.x')
     block_y = tvm.thread_axis('blockIdx.y')

From 603afa35bf497561724763524ed852b757066a68 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 13 Sep 2018 10:05:33 +0800
Subject: [PATCH 2/4] Rename int8_intrinsics.py -> tensor_intrin.py

---
 .../topi/cuda/{int8_intrinsics.py => tensor_intrin.py}      | 4 ++--
 topi/recipe/gemm/gemm_int8.py                               | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)
 rename topi/python/topi/cuda/{int8_intrinsics.py => tensor_intrin.py} (95%)

diff --git a/topi/python/topi/cuda/int8_intrinsics.py b/topi/python/topi/cuda/tensor_intrin.py
similarity index 95%
rename from topi/python/topi/cuda/int8_intrinsics.py
rename to topi/python/topi/cuda/tensor_intrin.py
index 85e7ab3fa410..4ee65876c452 100644
--- a/topi/python/topi/cuda/int8_intrinsics.py
+++ b/topi/python/topi/cuda/tensor_intrin.py
@@ -1,9 +1,9 @@
-"""Int8 intrinsics on CUDA."""
+"""Tensor intrinsics on CUDA."""
 #pylint: disable=invalid-name
 import tvm
 
 
-def _intrin_dp4a_reduce(x_scope, y_scope, z_scope):
+def dp4a(x_scope, y_scope, z_scope):
     """
     Int8 dot product reduced by every 4 elements using __dp4a
 
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
index 49353f511cf3..ed735dad9cd9 100644
--- a/topi/recipe/gemm/gemm_int8.py
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -4,12 +4,12 @@
 import numpy as np
 import tvm
 from tvm import autotvm
-from topi.cuda.int8_intrinsics import _intrin_dp4a_reduce
+from topi.cuda.tensor_intrin import dp4a
 
 DO_TUNING = True
 PRETUNED_INDEX = 75333
 
-intrin_dp4a_reduce = _intrin_dp4a_reduce('local', 'local', 'local')
+intrin_dp4a = dp4a('local', 'local', 'local')
 
 @autotvm.template
 def gemm_int8(n, m, l):
@@ -38,7 +38,7 @@ def gemm_int8(n, m, l):
 
     ko, kt, ki = cfg['tile_k'].apply(s, CC, k)
 
-    s[CC].tensorize(ki, intrin_dp4a_reduce)
+    s[CC].tensorize(ki, intrin_dp4a)
 
     block_x = tvm.thread_axis('blockIdx.x')
     block_y = tvm.thread_axis('blockIdx.y')

From 77f65600bd7688894a9cbb26ffb1e480f39a14d8 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 13 Sep 2018 10:11:17 +0800
Subject: [PATCH 3/4] Rename variable to fix lint

---
 topi/python/topi/cuda/tensor_intrin.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py
index 4ee65876c452..c398175e81b8 100644
--- a/topi/python/topi/cuda/tensor_intrin.py
+++ b/topi/python/topi/cuda/tensor_intrin.py
@@ -37,8 +37,8 @@ def _instr(index):
             vec_y = yy.vload(0, dtype='int8x4')
             prev_z = 0 if index == 0 else zz.vload(0)
 
-            dp4a = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
-            ib.emit(zz.vstore(0, dp4a))
+            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
+            ib.emit(zz.vstore(0, new_z))
 
             return ib.get()
 

From 23c65be2c7ff55c6fbb6a5045ae9f875c4f89a2f Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Fri, 14 Sep 2018 10:14:52 +0800
Subject: [PATCH 4/4] Improve doc

---
 topi/python/topi/cuda/tensor_intrin.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py
index c398175e81b8..26ae7587c5df 100644
--- a/topi/python/topi/cuda/tensor_intrin.py
+++ b/topi/python/topi/cuda/tensor_intrin.py
@@ -3,15 +3,23 @@
 import tvm
 
 
-def dp4a(x_scope, y_scope, z_scope):
+def dp4a(x_scope='local', y_scope='local', z_scope='local'):
     """
     Int8 dot product reduced by every 4 elements using __dp4a
 
     Parameters
     ----------
-    x_scope: The storage scope of buffer for lhs
-    y_scope: The storage scope of buffer for rhs
-    z_scope: The storage scope of buffer for result
+    x_scope : str, optional
+        The storage scope of buffer for lhs
+    y_scope : str, optional
+        The storage scope of buffer for rhs
+    z_scope : str, optional
+        The storage scope of buffer for result
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The dp4a TensorIntrin that can be used in tensorizing schedule.
     """
 
     n = 4  # dp4a requires operands packed by 4