From a17e5c7b5104b766deb539fd357a0ac81a5ec0a0 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 12 Sep 2018 16:18:12 +0800 Subject: [PATCH 1/4] [TOPI] Add dp4a intrinsic to CUDA --- topi/python/topi/cuda/int8_intrinsics.py | 54 ++++++++++++++++++++++++ topi/recipe/gemm/gemm_int8.py | 38 ++--------------- 2 files changed, 57 insertions(+), 35 deletions(-) create mode 100644 topi/python/topi/cuda/int8_intrinsics.py diff --git a/topi/python/topi/cuda/int8_intrinsics.py b/topi/python/topi/cuda/int8_intrinsics.py new file mode 100644 index 000000000000..85e7ab3fa410 --- /dev/null +++ b/topi/python/topi/cuda/int8_intrinsics.py @@ -0,0 +1,54 @@ +"""Int8 intrinsics on CUDA.""" +#pylint: disable=invalid-name +import tvm + + +def _intrin_dp4a_reduce(x_scope, y_scope, z_scope): + """ + Int8 dot product reduced by every 4 elements using __dp4a + + Parameters + ---------- + x_scope: The storage scope of buffer for lhs + y_scope: The storage scope of buffer for rhs + z_scope: The storage scope of buffer for result + """ + + n = 4 # dp4a requires operands packed by 4 + x = tvm.placeholder((n,), name='x', dtype='int8') + y = tvm.placeholder((n,), name='y', dtype='int8') + + k = tvm.reduce_axis((0, n), name='rc') + + z = tvm.compute((1,), lambda i: tvm.sum( + x[k].astype('int32') * y[k].astype('int32'), axis=[k])) + + def _intrin_func(ins, outs): + def _instr(index): + xx, yy = ins + zz = outs[0] + + if index == 1: + return zz.vstore(0, 0) + + ib = tvm.ir_builder.create() + + vec_x = xx.vload(0, dtype='int8x4') + vec_y = yy.vload(0, dtype='int8x4') + prev_z = 0 if index == 0 else zz.vload(0) + + dp4a = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) + ib.emit(zz.vstore(0, dp4a)) + + return ib.get() + + return _instr(0), _instr(1), _instr(2) # body, reset, update + + with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: + scopes = {x: x_scope, y: y_scope, z: z_scope} + binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor, + scope=scopes[t]) for t in [x, y, z]} + + return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds) diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py index 4cce2735c4a2..49353f511cf3 100644 --- a/topi/recipe/gemm/gemm_int8.py +++ b/topi/recipe/gemm/gemm_int8.py @@ -4,44 +4,12 @@ import numpy as np import tvm from tvm import autotvm +from topi.cuda.int8_intrinsics import _intrin_dp4a_reduce DO_TUNING = True PRETUNED_INDEX = 75333 -def intrin_dot(): - n = 4 # dp4a requires operands packed by 4 - x = tvm.placeholder((n,), name='x', dtype='int8') - y = tvm.placeholder((n,), name='y', dtype='int8') - k = tvm.reduce_axis((0, n), name='k') - - z = tvm.compute( - (1,), lambda _: tvm.sum( - x[k].astype('int32') * y[k].astype('int32'), axis=k)) - - def intrin_func(ins, outs): - xx, yy = ins - zz = outs[0] - ib = tvm.ir_builder.create() - - dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a', - xx.vload(0, dtype='int8x4'), - yy.vload(0, dtype='int8x4'), - zz.vload(0))) - ib.emit(dp4a) - - body = ib.get() - return body, zz.vstore(0, 0), body - - with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: - binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, - data_alignment=cfg.data_alignment, - offset_factor=cfg.offset_factor, - scope='local') for t in [x, y, z]} - return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds) - - -dot = intrin_dot() - +intrin_dp4a_reduce = _intrin_dp4a_reduce('local', 'local', 'local') @autotvm.template def gemm_int8(n, m, l): @@ -70,7 +38,7 @@ def gemm_int8(n, m, l): ko, kt, ki = cfg['tile_k'].apply(s, CC, k) - s[CC].tensorize(ki, dot) + s[CC].tensorize(ki, intrin_dp4a_reduce) block_x = tvm.thread_axis('blockIdx.x') block_y = tvm.thread_axis('blockIdx.y') From 603afa35bf497561724763524ed852b757066a68 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 13 Sep 2018 10:05:33 +0800 Subject: [PATCH 2/4] Rename int8_intrinsics.py -> tensor_intrin.py --- .../topi/cuda/{int8_intrinsics.py => tensor_intrin.py} | 4 ++-- topi/recipe/gemm/gemm_int8.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) rename topi/python/topi/cuda/{int8_intrinsics.py => tensor_intrin.py} (95%) diff --git a/topi/python/topi/cuda/int8_intrinsics.py b/topi/python/topi/cuda/tensor_intrin.py similarity index 95% rename from topi/python/topi/cuda/int8_intrinsics.py rename to topi/python/topi/cuda/tensor_intrin.py index 85e7ab3fa410..4ee65876c452 100644 --- a/topi/python/topi/cuda/int8_intrinsics.py +++ b/topi/python/topi/cuda/tensor_intrin.py @@ -1,9 +1,9 @@ -"""Int8 intrinsics on CUDA.""" +"""Tensor intrinsics on CUDA.""" #pylint: disable=invalid-name import tvm -def _intrin_dp4a_reduce(x_scope, y_scope, z_scope): +def dp4a(x_scope, y_scope, z_scope): """ Int8 dot product reduced by every 4 elements using __dp4a diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py index 49353f511cf3..ed735dad9cd9 100644 --- a/topi/recipe/gemm/gemm_int8.py +++ b/topi/recipe/gemm/gemm_int8.py @@ -4,12 +4,12 @@ import numpy as np import tvm from tvm import autotvm -from topi.cuda.int8_intrinsics import _intrin_dp4a_reduce +from topi.cuda.tensor_intrin import dp4a DO_TUNING = True PRETUNED_INDEX = 75333 -intrin_dp4a_reduce = _intrin_dp4a_reduce('local', 'local', 'local') +intrin_dp4a = dp4a('local', 'local', 'local') @autotvm.template def gemm_int8(n, m, l): @@ -38,7 +38,7 @@ def gemm_int8(n, m, l): ko, kt, ki = cfg['tile_k'].apply(s, CC, k) - s[CC].tensorize(ki, intrin_dp4a_reduce) + s[CC].tensorize(ki, intrin_dp4a) block_x = tvm.thread_axis('blockIdx.x') block_y = tvm.thread_axis('blockIdx.y') From 77f65600bd7688894a9cbb26ffb1e480f39a14d8 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 13 Sep 2018 10:11:17 +0800 Subject: [PATCH 3/4] Rename variable to fix lint --- topi/python/topi/cuda/tensor_intrin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py index 4ee65876c452..c398175e81b8 100644 --- a/topi/python/topi/cuda/tensor_intrin.py +++ b/topi/python/topi/cuda/tensor_intrin.py @@ -37,8 +37,8 @@ def _instr(index): vec_y = yy.vload(0, dtype='int8x4') prev_z = 0 if index == 0 else zz.vload(0) - dp4a = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) - ib.emit(zz.vstore(0, dp4a)) + new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) + ib.emit(zz.vstore(0, new_z)) return ib.get() From 23c65be2c7ff55c6fbb6a5045ae9f875c4f89a2f Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 14 Sep 2018 10:14:52 +0800 Subject: [PATCH 4/4] Improve doc --- topi/python/topi/cuda/tensor_intrin.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py index c398175e81b8..26ae7587c5df 100644 --- a/topi/python/topi/cuda/tensor_intrin.py +++ b/topi/python/topi/cuda/tensor_intrin.py @@ -3,15 +3,23 @@ import tvm -def dp4a(x_scope, y_scope, z_scope): +def dp4a(x_scope='local', y_scope='local', z_scope='local'): """ Int8 dot product reduced by every 4 elements using __dp4a Parameters ---------- - x_scope: The storage scope of buffer for lhs - y_scope: The storage scope of buffer for rhs - z_scope: The storage scope of buffer for result + x_scope : str, optional + The storage scope of buffer for lhs + y_scope : str, optional + The storage scope of buffer for rhs + z_scope : str, optional + The storage scope of buffer for result + + Returns + ------- + intrin : TensorIntrin + The dp4a TensorIntrin that can be used in tensorizing schedule. """ n = 4 # dp4a requires operands packed by 4