From 2bab6c8a856d2d8363ceaf60f3aba05baabe6ead Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 14 Sep 2018 15:39:59 +0800 Subject: [PATCH 01/16] Add int8 conv2d NCHWc template --- topi/python/topi/cuda/conv2d.py | 17 +- topi/python/topi/cuda/conv2d_int8.py | 279 +++++++++++++++++++++++++++ 2 files changed, 292 insertions(+), 4 deletions(-) create mode 100644 topi/python/topi/cuda/conv2d_int8.py diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index a7d5f742d98c..980be6e634d5 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -9,9 +9,10 @@ from .conv2d_direct import schedule_direct_cuda from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda +from .conv2d_int8 import conv2d_int8_NCHWc, schedule_conv2d_int8_NCHWc -@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd']) +@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8']) def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'): """Conv2D operator for cuda backend. @@ -21,10 +22,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f The config for this template data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] + 4-D with shape [batch, in_channel, in_height, in_width] or + 5-D with shape [batch, ic_chunk, in_height, in_width, ic_block] kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -98,7 +102,10 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f if cfg.template_key == 'winograd': return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed=False) + if cfg.template_key == 'int8': + return conv2d_int8_NCHWc(cfg, data, kernel, strides, padding, layout, out_dtype) + print(cfg.template_key) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype) elif layout == 'HWCN': @@ -108,7 +115,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f @autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"], - ["direct", 'winograd']) + ["direct", 'winograd', "int8"]) def schedule_conv2d_nchw_cuda(cfg, outs): """TOPI schedule callback of conv2d for cuda gpu @@ -138,6 +145,8 @@ def _callback(op): schedule_direct_cuda(cfg, s, op.output(0)) if op.tag == 'conv2d_nchw_winograd': schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) + if op.tag == "conv2d_int8_NCHWc": + schedule_conv2d_int8_NCHWc(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py new file mode 100644 index 000000000000..43626c60b78c --- /dev/null +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -0,0 +1,279 @@ +# pylint: disable=invalid-name +"""Int8 conv2d in NCHWc layout""" +import tvm +from tvm import autotvm + +from .tensor_intrin import dp4a +from ..nn.pad import pad +from ..nn.util import get_pad_tuple +from ..util import get_const_tuple, get_const_int + + +def _conv2d_int8_NCHWc_arg_to_workload(data, kernel, stride, padding, layout, out_dtype): + """convert argument to workload""" + 
shape = get_const_tuple(data.shape) + if len(shape) == 5: + N, ic_chunk, H, W, ic_block = shape + raw_data = tvm.placeholder( + (N, ic_chunk*ic_block, H, W), dtype=data.dtype) + else: + raw_data = data + + shape = get_const_tuple(kernel.shape) + if len(shape) == 6: + oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape + raw_kernel = tvm.placeholder( + (oc_chunk, ic_chunk, KH, KW, oc_block, ic_block), dtype=kernel.dtype) + else: + raw_kernel = kernel + + return ('conv2d', ) + autotvm.task.task.args_to_workload( + [raw_data, raw_kernel, stride, padding, out_dtype]) + + +def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): + """Convolution operator in NCHW[x]c layout for int8. + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] or + 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] + + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. + + Returns + ------- + output : tvm.Tensor + 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] + """ + assert layout == 'NCHW' + + ic_block_factor = 4 + oc_block_factor = 4 + + if len(data.shape) == 4: + batch, channels, height, width = get_const_tuple(data.shape) + assert channels % ic_block_factor == 0, \ + "Number of input channels should be multiple of {}".format( + ic_block_factor) + packed_data = tvm.compute((batch, channels/ic_block_factor, height, width, ic_block_factor), + lambda n, c, h, w, vc: kernel[n, + c*ic_block_factor + vc, h, w], + name="packed_data") + else: + packed_data = data + + if len(kernel.shape) == 4: + out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( + kernel.shape) + assert out_channels % 4 == 0, \ + "Number of output channels should be multiple of {}".format( + oc_block_factor) + packed_kernel = tvm.compute( + (out_channels / oc_block_factor, in_channels / ic_block_factor, kernel_h, kernel_w, + oc_block_factor, ic_block_factor), + lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: + kernel[oc_chunk * oc_block_factor + oc_block, + ic_chunk * ic_block_factor + ic_block, kh, kw], + name="packed_kernel") + else: + packed_kernel = kernel + + batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple( + packed_data.shape) + oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple( + packed_kernel.shape) + + stride_h, stride_w = (stride, stride) if isinstance( + stride, int) else stride + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (kernel_h, kernel_w)) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") + + # compute the output shape + out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1 + + oshape = (batch, oc_chunk, out_height, out_width, oc_block) + + icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk') + icb = 
tvm.reduce_axis((0, ic_block), name='ic_block') + kh = tvm.reduce_axis((0, kernel_h), name='kh') + kw = tvm.reduce_axis((0, kernel_w), name='kw') + + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(pad_data[n, icc, oh*stride_h+kh, ow*stride_w+kw, icb] + .astype('int32') * + packed_kernel[oc_chunk, icc, + kh, kw, oc_block, icb] + .astype('int32'), + axis=[icc, kh, kw, icb])) + + output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype), + tag="conv2d_int8_NCHWc", + attrs={"workload": _conv2d_int8_NCHWc_arg_to_workload( + data, kernel, stride, padding, layout, out_dtype)}) + + # num flop + num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ + ic_chunk * ic_block * kernel_h * kernel_w * 2 + cfg.add_flop(num_flop) + + return output + + +_dp4a = dp4a('shared', 'shared', 'local') + + +def schedule_conv2d_int8_NCHWc(cfg, s, output): + """Schedule conv2d int8 NCHWc template""" + workload = output.op.attrs["workload"] + + stride = workload[3] + + conv = output.op.input_tensors[0] + packed_data, packed_kernel = conv.op.input_tensors + + if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag: + pad_data = packed_data + packed_data = pad_data.op.input_tensors[0] + else: + pad_data = packed_data + + if autotvm.GLOBAL_SCOPE.in_tuning: + # skip this part during tuning to make recrods accurate + # this part will be pre-computed during NNVM's pre-compute optimization pass + s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") + s[packed_kernel].pragma( + s[packed_kernel].op.axis[0], "debug_skip_region") + else: + if isinstance(packed_data.op, tvm.tensor.ComputeOp): + s[packed_data].compute_inline() + if isinstance(packed_kernel.op, tvm.tensor.ComputeOp): + s[packed_kernel].compute_inline() + + if pad_data != packed_data: + s[pad_data].compute_inline() + + s[conv].set_scope('local') + + batch = get_const_int(packed_data.shape[0]) + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + # create cache stage + AA = s.cache_read(pad_data, 'shared', [conv]) + WW = s.cache_read(packed_kernel, 'shared', [conv]) + + # tile and bind spatial axes + n, f, y, x, c = s[output].op.axis + cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) + cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) + cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) + + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + # this is the scope to attach global config inside this kernel + kernel_scope, n = s[output].split(n, nparts=1) + + max_block_z = 128 + if batch > max_block_z: + _, n = s[output].split(n, factor=max_block_z) + s[output].bind(n, tvm.thread_axis("blockIdx.z")) + + s[output].bind(bf, tvm.thread_axis("blockIdx.y")) + s[output].bind(bx, tvm.thread_axis("blockIdx.x")) + s[output].bind(vf, tvm.thread_axis("vthread")) + s[output].bind(vy, tvm.thread_axis("vthread")) + s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(tf, tvm.thread_axis("threadIdx.z")) + s[output].bind(ty, tvm.thread_axis("threadIdx.y")) + s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + + s[conv].compute_at(s[output], tx) + + # tile and bind reduction axes + n, f, y, x, c = s[conv].op.axis + + rc, ry, rx, rc_block = s[conv].op.reduce_axis + 
cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) + cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) + cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) + rco, rci = cfg['tile_rc'].apply(s, conv, rc) + ryo, ryi = cfg['tile_ry'].apply(s, conv, ry) + rxo, rxi = cfg['tile_rx'].apply(s, conv, rx) + + s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) + + _, rc_block = s[conv].split(rc_block, factor=4) + s[conv].tensorize(rc_block, _dp4a) + + s[AA].compute_at(s[conv], rxo) + s[WW].compute_at(s[conv], rxo) + + # cooperative fetching + for load in [AA, WW]: + if load == AA: + n, f, y, x, c = s[load].op.axis + if pad_data != packed_data and stride_h == 1 and stride_w == 1: + s[load].vectorize(c) + fused = s[load].fuse(n, f, y, x) + else: + c, _ = s[load].split(c, factor=4) + fused = s[load].fuse(n, f, y, x, c) + else: + n, f, y, x, oc_chunk, c = s[load].op.axis + fused = s[load].fuse(n, f, y, x, oc_chunk) + s[load].vectorize(c) + + fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2]) + fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2]) + fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2]) + s[load].bind(tz, tvm.thread_axis("threadIdx.z")) + s[load].bind(ty, tvm.thread_axis("threadIdx.y")) + s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + + # double buffer + for load in [AA, WW]: + name = load.op.name + '_double_buffer' + cfg.define_knob(name, [0, 1]) + + if cfg[name].val: + s[load].double_buffer() + + # unroll + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + s[output].pragma(kernel_scope, 'auto_unroll_max_step', + cfg['auto_unroll_max_step'].val) + s[output].pragma(kernel_scope, 'unroll_explicit', False) + + return s From 2fd0a15f3081e651764e7ca6b4d3cd9aa8f4f32d Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 19 Sep 2018 18:51:28 +0800 Subject: [PATCH 02/16] Add NNVM symbol of conv2d NCHWc int8 --- nnvm/python/nnvm/top/nn.py | 31 +++++++ nnvm/src/top/nn/convolution.cc | 16 ++++ topi/python/topi/cuda/conv2d.py | 10 +-- topi/python/topi/cuda/conv2d_int8.py | 104 ++++++++++++++++------- topi/python/topi/cuda/conv2d_winograd.py | 6 ++ topi/python/topi/generic/nn.py | 18 ++++ topi/python/topi/nn/conv2d.py | 34 ++++++++ 7 files changed, 184 insertions(+), 35 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index b452738123c3..cee0d7741977 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -196,7 +196,38 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target): reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) +@reg.register_compute("_contrib_conv2d_NCHWc_int8_prepacked") +def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): + """Compute definition of int8 conv2d NCHWc""" + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.get_string("layout") + out_dtype = attrs.get_string("out_dtype") + out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype + assert dilation == (1, 1), "not support dilate now" + if groups == 1: + # pylint: disable=assignment-from-no-return + out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, out_dtype) + # pylint: enable=assignment-from-no-return + else: + raise ValueError("not support arbitrary group number > 1 for now") + if attrs.get_bool("use_bias"): + bias = inputs[2] + bias = topi.expand_dims(bias, axis=1, 
num_newaxis=2) + out = topi.add(out, bias) + return out + +@reg.register_schedule("_contrib_conv2d_NCHWc_int8_prepacked") +def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): + with tvm.target.create(target): + return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs) + +reg.register_pattern("_contrib_conv2d_NCHWc_int8", OpPattern.OUT_ELEMWISE_FUSABLE) + + @reg.register_compute("_contrib_conv2d_winograd_weight_transform") def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _): return topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size')) diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index d5c9c18f68a6..0a6e2d09b73f 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -344,6 +344,22 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc) .set_num_inputs(UseBiasNumInputs) .set_support_level(2); +NNVM_REGISTER_OP(_contrib_conv2d_NCHWc_int8_prepacked) +.describe(R"code(2D convolution layer in int8 using prepacked data and kernel. +)code" NNVM_ADD_FILELINE) +.add_argument("data", "5D Tensor", "Packed input data.") +.add_argument("weight", "6D Tensor", "Packed weight matrix.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", Conv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(2); NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform) .describe(R"code(Weight transformation of winograd fast convolution algorithm. diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 980be6e634d5..450e4e3b442e 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -9,7 +9,7 @@ from .conv2d_direct import schedule_direct_cuda from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda -from .conv2d_int8 import conv2d_int8_NCHWc, schedule_conv2d_int8_NCHWc +from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8 @autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8']) @@ -103,9 +103,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed=False) if cfg.template_key == 'int8': - return conv2d_int8_NCHWc(cfg, data, kernel, strides, padding, layout, out_dtype) + return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype, + pre_computed=False) - print(cfg.template_key) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype) elif layout == 'HWCN': @@ -145,8 +145,8 @@ def _callback(op): schedule_direct_cuda(cfg, s, op.output(0)) if op.tag == 'conv2d_nchw_winograd': schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) - if op.tag == "conv2d_int8_NCHWc": - schedule_conv2d_int8_NCHWc(cfg, s, op.output(0)) + if op.tag == "conv2d_NCHWc_int8": + schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 43626c60b78c..de67ef25a3a3 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -3,13 +3,15 @@ 
import tvm from tvm import autotvm +from ..generic import schedule_conv2d_NCHWc_int8_prepacked from .tensor_intrin import dp4a +from ..nn.conv2d import conv2d_NCHWc_int8_prepacked from ..nn.pad import pad from ..nn.util import get_pad_tuple -from ..util import get_const_tuple, get_const_int +from ..util import get_const_tuple, get_const_int, traverse_inline -def _conv2d_int8_NCHWc_arg_to_workload(data, kernel, stride, padding, layout, out_dtype): +def _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype): """convert argument to workload""" shape = get_const_tuple(data.shape) if len(shape) == 5: @@ -23,15 +25,15 @@ def _conv2d_int8_NCHWc_arg_to_workload(data, kernel, stride, padding, layout, ou if len(shape) == 6: oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape raw_kernel = tvm.placeholder( - (oc_chunk, ic_chunk, KH, KW, oc_block, ic_block), dtype=kernel.dtype) + (oc_chunk*oc_block, ic_chunk*ic_block, KH, KW), dtype=kernel.dtype) else: raw_kernel = kernel return ('conv2d', ) + autotvm.task.task.args_to_workload( - [raw_data, raw_kernel, stride, padding, out_dtype]) + [raw_data, raw_kernel, stride, padding, "NCHW", out_dtype]) -def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): +def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre_computed): """Convolution operator in NCHW[x]c layout for int8. Parameters @@ -57,20 +59,23 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): layout : str layout of data - out_dtype: str + out_dtype : str The output type. This is used for mixed precision. + pre_computed : str + Whether packed data and kernel are pre-computed + Returns ------- output : tvm.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - assert layout == 'NCHW' + assert layout == "NCHW" or layout == "NCHW4c" ic_block_factor = 4 oc_block_factor = 4 - if len(data.shape) == 4: + if not pre_computed: batch, channels, height, width = get_const_tuple(data.shape) assert channels % ic_block_factor == 0, \ "Number of input channels should be multiple of {}".format( @@ -79,10 +84,7 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): lambda n, c, h, w, vc: kernel[n, c*ic_block_factor + vc, h, w], name="packed_data") - else: - packed_data = data - if len(kernel.shape) == 4: out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( kernel.shape) assert out_channels % 4 == 0, \ @@ -95,7 +97,9 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): kernel[oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw], name="packed_kernel") + else: + packed_data = data packed_kernel = kernel batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple( @@ -134,9 +138,9 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype), - tag="conv2d_int8_NCHWc", - attrs={"workload": _conv2d_int8_NCHWc_arg_to_workload( - data, kernel, stride, padding, layout, out_dtype)}) + tag="conv2d_NCHWc_int8", + attrs={"workload": _conv2d_NCHWc_int8_arg_to_workload( + data, kernel, stride, padding, out_dtype)}) # num flop num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ @@ -149,7 +153,7 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): _dp4a = dp4a('shared', 'shared', 'local') -def 
schedule_conv2d_int8_NCHWc(cfg, s, output): +def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): """Schedule conv2d int8 NCHWc template""" workload = output.op.attrs["workload"] @@ -164,14 +168,14 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): else: pad_data = packed_data - if autotvm.GLOBAL_SCOPE.in_tuning: - # skip this part during tuning to make recrods accurate - # this part will be pre-computed during NNVM's pre-compute optimization pass - s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") - s[packed_kernel].pragma( - s[packed_kernel].op.axis[0], "debug_skip_region") - else: - if isinstance(packed_data.op, tvm.tensor.ComputeOp): + if not pre_computed: + if autotvm.GLOBAL_SCOPE.in_tuning: + # skip this part during tuning to make recrods accurate + # this part will be pre-computed during NNVM's pre-compute optimization pass + s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") + s[packed_kernel].pragma( + s[packed_kernel].op.axis[0], "debug_skip_region") + else: s[packed_data].compute_inline() if isinstance(packed_kernel.op, tvm.tensor.ComputeOp): s[packed_kernel].compute_inline() @@ -244,7 +248,7 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): for load in [AA, WW]: if load == AA: n, f, y, x, c = s[load].op.axis - if pad_data != packed_data and stride_h == 1 and stride_w == 1: + if pad_data == packed_data and stride_h == 1 and stride_w == 1: s[load].vectorize(c) fused = s[load].fuse(n, f, y, x) else: @@ -263,12 +267,12 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): s[load].bind(tx, tvm.thread_axis("threadIdx.x")) # double buffer - for load in [AA, WW]: - name = load.op.name + '_double_buffer' - cfg.define_knob(name, [0, 1]) - - if cfg[name].val: - s[load].double_buffer() + cfg.define_knob('AA_double_buffer', [0, 1]) + cfg.define_knob('WW_double_buffer', [0, 1]) + if cfg['AA_double_buffer'].val: + s[AA].double_buffer() + if cfg['WW_double_buffer'].val: + s[WW].double_buffer() # unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) @@ -277,3 +281,43 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): s[output].pragma(kernel_scope, 'unroll_explicit', False) return s + + +@conv2d_NCHWc_int8_prepacked.register(["cuda", "gpu"]) +@autotvm.task.dispatcher +def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype): + assert layout == 'NCHW4c' + return _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype) + + +@conv2d_NCHWc_int8_prepacked_dispatcher.register("int8") +def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype): + return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, + pre_computed=True) + +@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda", "gpu"], ["int8"]) +def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs): + """TOPI schedule callback of conv2d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d. 
+ """ + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv2d_NCHWc_int8' in op.tag: + schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=True) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 7e0574ea606b..f2b6d960135c 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -375,6 +375,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): if cfg.template_key == 'direct': return None + if cfg.template_key == 'int8': + new_attrs['layout'] = 'NCHW4c' + new_attrs['out_layout'] = 'NCHW4c' + new_attrs['kernel_layout'] = 'OIHW4o4i' + return sym.contrib.conv2d_NCHWc_int8_prepacked(*copy_inputs, **new_attrs) + # pre-compute weight transformation in winograd tile_size = _infer_tile_size(tinfos[0], tinfos[1]) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 874decc792ec..e99ce263296b 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -139,6 +139,24 @@ def schedule_conv2d_winograd_without_weight_transform(outs): return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_conv2d_NCHWc_int8_prepacked(outs): + """Schedule for conv2d NCHWc int8 with prepacked data and kernel + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of this operator + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + @tvm.target.generic_func def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index e0d2c403d4b4..e389b1eb4bfc 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -397,3 +397,37 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, 4-D with shape [batch, out_height, out_width, out_channel] """ raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") + + +@tvm.target.generic_func +def conv2d_NCHWc_int8_prepacked(data, kernel, stride, padding, layout, out_dtype): + """Convolution operator in NCHW[x]c layout for int8. Data and kernel should be packed in + advance. + + Parameters + ---------- + data : tvm.Tensor + 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] + + kernel : tvm.Tensor + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. 
+ + Returns + ------- + output : tvm.Tensor + 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] + """ + raise ValueError("missing register for topi.nn.conv2d_NCHWc_int8_prepacked") From 35df666aec30ebbe7169bb19240a9ec1ee3fec4a Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 20 Sep 2018 10:38:33 +0800 Subject: [PATCH 03/16] Fix computing packed_kernel --- topi/python/topi/cuda/conv2d_int8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index de67ef25a3a3..4d666ca96e4b 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -177,7 +177,6 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): s[packed_kernel].op.axis[0], "debug_skip_region") else: s[packed_data].compute_inline() - if isinstance(packed_kernel.op, tvm.tensor.ComputeOp): s[packed_kernel].compute_inline() if pad_data != packed_data: From 5a2e6ff81a90f835a57a8200fccc187344c50974 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 20 Sep 2018 10:44:38 +0800 Subject: [PATCH 04/16] Fix style --- nnvm/python/nnvm/top/nn.py | 7 ++++--- topi/python/topi/cuda/conv2d.py | 2 +- topi/python/topi/cuda/conv2d_int8.py | 15 ++++++++------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index cee0d7741977..bdf949ff4d87 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -209,7 +209,8 @@ def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): assert dilation == (1, 1), "not support dilate now" if groups == 1: # pylint: disable=assignment-from-no-return - out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, out_dtype) + out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, + out_dtype) # pylint: enable=assignment-from-no-return else: raise ValueError("not support arbitrary group number > 1 for now") @@ -218,7 +219,7 @@ def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): bias = topi.expand_dims(bias, axis=1, num_newaxis=2) out = topi.add(out, bias) return out - + @reg.register_schedule("_contrib_conv2d_NCHWc_int8_prepacked") def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): with tvm.target.create(target): @@ -227,7 +228,7 @@ def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): reg.register_pattern("_contrib_conv2d_NCHWc_int8", OpPattern.OUT_ELEMWISE_FUSABLE) - + @reg.register_compute("_contrib_conv2d_winograd_weight_transform") def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _): return topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size')) diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 450e4e3b442e..4dac40746419 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -104,7 +104,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f pre_computed=False) if cfg.template_key == 'int8': return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype, - pre_computed=False) + pre_computed=False) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 4d666ca96e4b..5dcc8a52e209 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -70,7 +70,7 @@ def 
conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre output : tvm.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - assert layout == "NCHW" or layout == "NCHW4c" + assert layout in ["NCHW", "NCHW4c"] ic_block_factor = 4 oc_block_factor = 4 @@ -80,7 +80,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre assert channels % ic_block_factor == 0, \ "Number of input channels should be multiple of {}".format( ic_block_factor) - packed_data = tvm.compute((batch, channels/ic_block_factor, height, width, ic_block_factor), + packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, + ic_block_factor), lambda n, c, h, w, vc: kernel[n, c*ic_block_factor + vc, h, w], name="packed_data") @@ -91,7 +92,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre "Number of output channels should be multiple of {}".format( oc_block_factor) packed_kernel = tvm.compute( - (out_channels / oc_block_factor, in_channels / ic_block_factor, kernel_h, kernel_w, + (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w, oc_block_factor, ic_block_factor), lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[oc_chunk * oc_block_factor + oc_block, @@ -282,7 +283,7 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): return s -@conv2d_NCHWc_int8_prepacked.register(["cuda", "gpu"]) +@conv2d_NCHWc_int8_prepacked.register(["cuda"]) @autotvm.task.dispatcher def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype): assert layout == 'NCHW4c' @@ -292,11 +293,11 @@ def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout @conv2d_NCHWc_int8_prepacked_dispatcher.register("int8") def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype): return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, - pre_computed=True) + pre_computed=True) -@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda", "gpu"], ["int8"]) +@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda"], ["int8"]) def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs): - """TOPI schedule callback of conv2d for cuda gpu + """TOPI schedule callback of conv2d for cuda Parameters ---------- From 74bced573a46754c41a282733e06d8ffa89899e8 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 20 Sep 2018 13:50:04 +0800 Subject: [PATCH 05/16] Compute flop in conv2d_direct manually to support int8 --- topi/python/topi/cuda/conv2d_direct.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py index 19e7ea38f647..9b315a6b0fc1 100644 --- a/topi/python/topi/cuda/conv2d_direct.py +++ b/topi/python/topi/cuda/conv2d_direct.py @@ -2,6 +2,7 @@ """The templates for cuda conv2d operators""" import tvm from tvm import autotvm +from ..util import get_const_tuple def schedule_direct_cuda(cfg, s, conv): """schedule optimized for batch size = 1""" @@ -94,3 +95,7 @@ def schedule_direct_cuda(cfg, s, conv): # unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + N, CO, OH, OW = get_const_tuple(output.shape) + _, KH, KW, CI = get_const_tuple(kernel.shape) + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) From 
b673f008de4c654d3ca583d43d44aa4ad1a55cd5 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 10:41:25 +0800 Subject: [PATCH 06/16] Handle bias in conv2d_int8 schedule --- topi/python/topi/cuda/conv2d_int8.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 5dcc8a52e209..d6da1eb1e1cf 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -183,8 +183,6 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): if pad_data != packed_data: s[pad_data].compute_inline() - s[conv].set_scope('local') - batch = get_const_int(packed_data.shape[0]) if isinstance(stride, int): stride_h = stride_w = stride @@ -195,6 +193,13 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): AA = s.cache_read(pad_data, 'shared', [conv]) WW = s.cache_read(packed_kernel, 'shared', [conv]) + s[conv].set_scope('local') + + # handle bias + if output.op not in s.outputs: + s[output].compute_inline() + output = s.outputs[0].output(0) + # tile and bind spatial axes n, f, y, x, c = s[output].op.axis cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) From c48e1fb40349693fd9a70204c0f3aed7f274d8cd Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 11:33:57 +0800 Subject: [PATCH 07/16] Fix wrong variable --- topi/python/topi/cuda/conv2d_int8.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index d6da1eb1e1cf..a1b73691d5cc 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -82,8 +82,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre ic_block_factor) packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, ic_block_factor), - lambda n, c, h, w, vc: kernel[n, - c*ic_block_factor + vc, h, w], + lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w], name="packed_data") out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( From a037241b0ba7710b07eb11368ba7d5d6a6d7032c Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 14:02:24 +0800 Subject: [PATCH 08/16] Improve style --- topi/python/topi/cuda/conv2d_int8.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index a1b73691d5cc..03bbfc0f876b 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -107,8 +107,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple( packed_kernel.shape) - stride_h, stride_w = (stride, stride) if isinstance( - stride, int) else stride + if isinstance(stride, int): + stride_h, stride_w = stride + else: + stride_h, stride_w = stride pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (kernel_h, kernel_w)) From d685e0d1419e1bf6b0fe0ed8db2b8b14f9349edc Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 18:48:28 +0800 Subject: [PATCH 09/16] Handle dilate and packing schedule correctly --- topi/python/topi/cuda/conv2d_int8.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 03bbfc0f876b..4e1b2672bed2 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ 
b/topi/python/topi/cuda/conv2d_int8.py @@ -3,6 +3,7 @@ import tvm from tvm import autotvm +from .injective import _schedule_injective from ..generic import schedule_conv2d_NCHWc_int8_prepacked from .tensor_intrin import dp4a from ..nn.conv2d import conv2d_NCHWc_int8_prepacked @@ -171,6 +172,7 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): pad_data = packed_data if not pre_computed: + kernel, = packed_kernel.op.input_tensors if autotvm.GLOBAL_SCOPE.in_tuning: # skip this part during tuning to make recrods accurate # this part will be pre-computed during NNVM's pre-compute optimization pass @@ -178,8 +180,13 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): s[packed_kernel].pragma( s[packed_kernel].op.axis[0], "debug_skip_region") else: - s[packed_data].compute_inline() - s[packed_kernel].compute_inline() + _schedule_injective(packed_data.op, s) + _schedule_injective(packed_kernel.op, s) + else: + kernel = packed_data + + if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() if pad_data != packed_data: s[pad_data].compute_inline() From 0809b0ea30b3fd74aa06ee464ed1bc044556c8be Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 19:23:58 +0800 Subject: [PATCH 10/16] Add unittests of conv2d_NCHWc_int8 --- topi/tests/python/test_topi_conv2d_int8.py | 173 +++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 topi/tests/python/test_topi_conv2d_int8.py diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py new file mode 100644 index 000000000000..bb441891155f --- /dev/null +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -0,0 +1,173 @@ +"""Example code to do convolution.""" + +import numpy as np +import tvm +from tvm import autotvm +from tvm.autotvm.task.space import FallbackConfigEntity +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple + +from common import get_all_backend + +oc_block_factor = 4 + + +def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): + print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') + bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', + dtype='int8') + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + def get_ref_data(): + a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) + w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) + + # convert to NCHWc + _, _, out_height, out_width = c_np.shape + c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \ + out_height, out_width)).transpose(0, 1, 3, 4, 2) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = 
np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + dW = topi.nn.dilate(W, (1, 1, dilation, dilation)) + C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding), + layout='NCHW', out_dtype=dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = topi.generic.schedule_conv2d_nchw([C]) + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + if add_bias: + tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func(a, w, b, c) + else: + func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func(a, w, c) + np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in ["cuda"]: + check_device(device) + + +class NCHWcInt8Fallback(autotvm.FallbackContext): + def _query_inside(self, target, workload): + key = (target, workload) + if key in self.memory: + return self.memory[key] + cfg = FallbackConfigEntity() + cfg.template_key = 'int8' + self.memory[key] = cfg + return cfg + + +def test_conv2d_nchw(): + with NCHWcInt8Fallback(): + # ResNet18 workloads where channels in / out are multiple of oc_block_factor + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 64, 56, 128, 3, 2, 1) + verify_conv2d_NCHWc_int8(1, 64, 56, 128, 1, 2, 0) + verify_conv2d_NCHWc_int8(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 128, 28, 256, 3, 2, 1) + verify_conv2d_NCHWc_int8(1, 128, 28, 256, 1, 2, 0) + verify_conv2d_NCHWc_int8(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 256, 14, 512, 3, 2, 1) + verify_conv2d_NCHWc_int8(1, 256, 14, 512, 1, 2, 0) + verify_conv2d_NCHWc_int8(1, 512, 7, 512, 3, 1, 1) + + # bias, relu + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_relu=True) + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True) + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True) + + # batch size + verify_conv2d_NCHWc_int8(4, 64, 56, 64, 3, 1, 1) + verify_conv2d_NCHWc_int8(9, 64, 56, 64, 3, 1, 1) + + # weird workloads + verify_conv2d_NCHWc_int8(4, 4, 4, 4, 4, 4, 4) + + # inception v3 workloads where channels in / out are multiple of oc_block_factor + verify_conv2d_NCHWc_int8(1, 32, 149, 32, 3, 1, 0) + verify_conv2d_NCHWc_int8(1, 32, 147, 64, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 64, 73, 80, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 80, 73, 192, 3, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 35, 64, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 35, 48, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 48, 35, 64, 5, 1, 2) + verify_conv2d_NCHWc_int8(1, 64, 35, 96, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 96, 35, 96, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 192, 35, 32, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 256, 35, 64, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 256, 35, 48, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 288, 35, 64, 1, 
1, 0) + verify_conv2d_NCHWc_int8(1, 288, 35, 48, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 288, 35, 384, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 96, 35, 96, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 768, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 768, 17, 128, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 128, 17, 128, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 128, 17, 192, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 128, 17, 128, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 128, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 768, 17, 160, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 160, 17, 160, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 160, 17, 192, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 160, 17, 160, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 160, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 17, 192, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 192, 17, 320, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 192, 17, 192, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 1280, 8, 320, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 1280, 8, 384, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 384, 8, 384, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 384, 8, 384, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 1280, 8, 448, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 448, 8, 384, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 1280, 8, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 320, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 384, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 448, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 1024, 19, 84, 3, 1, 1) + + +if __name__ == "__main__": + test_conv2d_nchw() From 9553a63aa5d1e56257c555427b7156c80328102d Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 25 Sep 2018 13:37:42 +0800 Subject: [PATCH 11/16] Add device compatability check in unittest --- topi/tests/python/test_topi_conv2d_int8.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py index bb441891155f..2b85b2b97cb1 100644 --- a/topi/tests/python/test_topi_conv2d_int8.py +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -21,7 +21,7 @@ def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, str A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') - bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', + bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', dtype='int8') a_shape = get_const_tuple(A.shape) @@ -57,6 +57,10 @@ def check_device(device): if not ctx.exist: print("Skip because %s is not enabled" % device) return + if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + print("Skip because int8 intrinsics are not available") + return + print("Running on target: %s" % device) with tvm.target.create(device): dW = topi.nn.dilate(W, (1, 1, dilation, dilation)) From ecad8bf05e80804218f8ef02bbc5c4337d247783 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 25 Sep 2018 16:06:01 +0800 Subject: [PATCH 12/16] Use sym.conv2d instead of new symbol --- nnvm/python/nnvm/top/nn.py | 47 ++++++------------------ nnvm/src/top/nn/convolution.cc | 17 --------- topi/python/topi/cuda/conv2d_winograd.py | 2 +- 3 files changed, 13 insertions(+), 53 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 
bdf949ff4d87..49192cacd713 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -90,10 +90,12 @@ def compute_conv2d(attrs, inputs, _): kernel_layout = attrs["kernel_layout"] out_dtype = attrs["out_dtype"] out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype - assert layout == "NCHW" or layout == "NHWC" + assert layout in ["NCHW", "NHWC", "NCHW4c"] (dilation_h, dilation_w) = dilation if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") + elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1): + raise ValueError("not support dilate now") elif dilation == (1, 1): kernel = inputs[1] elif layout == "NCHW": @@ -101,7 +103,12 @@ def compute_conv2d(attrs, inputs, _): else: #layout == NHWC kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1]) - if groups == 1: + if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8': + # pylint: disable=assignment-from-no-return + out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding, + layout, out_dtype=out_dtype) + # pylint: enable=assignment-from-no-return + elif groups == 1: out = topi.nn.conv2d( inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype) elif layout == "NCHW" and \ @@ -120,7 +127,7 @@ def compute_conv2d(attrs, inputs, _): if attrs.get_bool("use_bias"): bias = inputs[2] - expand_axis = 1 if layout == "NCHW" else 0 + expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0 bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2) out = topi.add(out, bias) return out @@ -136,6 +143,8 @@ def schedule_conv2d(attrs, outs, target): with tvm.target.create(target): if groups == 1 and layout == "NCHW": return topi.generic.schedule_conv2d_nchw(outs) + elif groups == 1 and layout == "NCHW4c": + return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs) elif groups == 1 and layout == "NHWC": return topi.generic.schedule_conv2d_nhwc(outs) elif groups == channels and layout == "NCHW": @@ -196,38 +205,6 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target): reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_compute("_contrib_conv2d_NCHWc_int8_prepacked") -def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): - """Compute definition of int8 conv2d NCHWc""" - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs.get_string("layout") - out_dtype = attrs.get_string("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype - assert dilation == (1, 1), "not support dilate now" - if groups == 1: - # pylint: disable=assignment-from-no-return - out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, - out_dtype) - # pylint: enable=assignment-from-no-return - else: - raise ValueError("not support arbitrary group number > 1 for now") - if attrs.get_bool("use_bias"): - bias = inputs[2] - bias = topi.expand_dims(bias, axis=1, num_newaxis=2) - out = topi.add(out, bias) - return out - -@reg.register_schedule("_contrib_conv2d_NCHWc_int8_prepacked") -def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): - with tvm.target.create(target): - return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs) - -reg.register_pattern("_contrib_conv2d_NCHWc_int8", OpPattern.OUT_ELEMWISE_FUSABLE) - - @reg.register_compute("_contrib_conv2d_winograd_weight_transform") def 
compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _): diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index 0a6e2d09b73f..22bda048a0a2 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -344,23 +344,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc) .set_num_inputs(UseBiasNumInputs) .set_support_level(2); -NNVM_REGISTER_OP(_contrib_conv2d_NCHWc_int8_prepacked) -.describe(R"code(2D convolution layer in int8 using prepacked data and kernel. -)code" NNVM_ADD_FILELINE) -.add_argument("data", "5D Tensor", "Packed input data.") -.add_argument("weight", "6D Tensor", "Packed weight matrix.") -.add_argument("bias", "1D Tensor", "Bias parameter.") -.add_arguments(Conv2DParam::__FIELDS__()) -.set_attr_parser(ParamParser) -.set_attr("FGetAttrDict", ParamGetAttrDict) -.set_attr("FListInputNames", UseBiasListInputNames) -.set_attr("FInferShape", Conv2DInferShape) -.set_attr("FInferType", Conv2DInferType) -.set_attr("FCorrectLayout", Conv2DCorrectLayout) -.set_num_outputs(1) -.set_num_inputs(UseBiasNumInputs) -.set_support_level(2); - NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform) .describe(R"code(Weight transformation of winograd fast convolution algorithm. Separate this into another nnvm symbol in order to enable Precompute Pass to compute the diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index f2b6d960135c..588859805c66 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -379,7 +379,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): new_attrs['layout'] = 'NCHW4c' new_attrs['out_layout'] = 'NCHW4c' new_attrs['kernel_layout'] = 'OIHW4o4i' - return sym.contrib.conv2d_NCHWc_int8_prepacked(*copy_inputs, **new_attrs) + return sym.conv2d(*copy_inputs, **new_attrs) # pre-compute weight transformation in winograd tile_size = _infer_tile_size(tinfos[0], tinfos[1]) From 71e4754b6808adca9fe8b5a8137ff23b6aa63d9a Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 25 Sep 2018 17:17:44 +0800 Subject: [PATCH 13/16] Add assert on target --- topi/python/topi/cuda/conv2d_winograd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 588859805c66..573e1375f0c7 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -376,6 +376,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): return None if cfg.template_key == 'int8': + assert tvm.target.current_target() == 'cuda' new_attrs['layout'] = 'NCHW4c' new_attrs['out_layout'] = 'NCHW4c' new_attrs['kernel_layout'] = 'OIHW4o4i' From 7bc4c09b1b7255ce1c571d94dc1dd18cbd475d91 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 26 Sep 2018 10:31:21 +0800 Subject: [PATCH 14/16] Fix assert on target --- topi/python/topi/cuda/conv2d_winograd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 573e1375f0c7..6a0a126b9e4f 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -376,7 +376,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): return None if cfg.template_key == 'int8': - assert tvm.target.current_target() == 'cuda' + assert 'cuda' in tvm.target.current_target().keys new_attrs['layout'] = 'NCHW4c' new_attrs['out_layout'] = 'NCHW4c' new_attrs['kernel_layout'] = 'OIHW4o4i' From 
d8492ceb25ac18cce060dd166f1068590c76ef87 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 26 Sep 2018 18:35:29 +0800 Subject: [PATCH 15/16] Bind block index to f, y, x --- topi/python/topi/cuda/conv2d_int8.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 4e1b2672bed2..a7e673f25f12 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -221,12 +221,8 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): # this is the scope to attach global config inside this kernel kernel_scope, n = s[output].split(n, nparts=1) - max_block_z = 128 - if batch > max_block_z: - _, n = s[output].split(n, factor=max_block_z) - s[output].bind(n, tvm.thread_axis("blockIdx.z")) - - s[output].bind(bf, tvm.thread_axis("blockIdx.y")) + s[output].bind(bf, tvm.thread_axis("blockIdx.z")) + s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) From 4521c6a6e100bc2ccdf900fa58cd8c3a838d4ce5 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 27 Sep 2018 10:16:56 +0800 Subject: [PATCH 16/16] Fuse by and bx --- topi/python/topi/cuda/conv2d_int8.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index a7e673f25f12..053c9bc6bd31 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -221,16 +221,20 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): # this is the scope to attach global config inside this kernel kernel_scope, n = s[output].split(n, nparts=1) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) + max_block_z = 128 + if batch > max_block_z: + _, n = s[output].split(n, factor=max_block_z) + s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + fused_byx = s[output].fuse(by, bx) + s[output].bind(n, tvm.thread_axis("blockIdx.z")) + s[output].bind(bf, tvm.thread_axis("blockIdx.y")) + s[output].bind(fused_byx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[conv].compute_at(s[output], tx)
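A minimal end-to-end sketch of driving the int8 NCHWc template introduced by this series, modeled on the unit test added in patch 10 and the device check added in patch 11: a FallbackConfigEntity with template_key = 'int8' makes the conv2d dispatcher pick the int8 template even without tuning records, and the dp4a path needs an sm_61+ GPU (checked via nvcc.have_int8). The workload shape, the helper name run_int8_conv2d and the choice of out_dtype='int32' are illustrative assumptions, not anything fixed by the patches.

# Usage sketch (assumed workload and helper names) for the int8 NCHWc conv2d template.
import numpy as np
import tvm
from tvm import autotvm
from tvm.autotvm.task.space import FallbackConfigEntity
from tvm.contrib import nvcc
import topi
from topi.util import get_const_tuple


class NCHWcInt8Fallback(autotvm.FallbackContext):
    """Return a fallback config with template_key='int8' so the int8 template is selected."""
    def _query_inside(self, target, workload):
        key = (target, workload)
        if key not in self.memory:
            cfg = FallbackConfigEntity()
            cfg.template_key = 'int8'
            self.memory[key] = cfg
        return self.memory[key]


def run_int8_conv2d(batch=1, in_channel=64, in_size=56, num_filter=64,
                    kernel=3, stride=1, padding=1):
    ctx = tvm.context('cuda', 0)
    if not ctx.exist or not nvcc.have_int8(ctx.compute_version):
        print("Skipping: CUDA device with dp4a support (sm_61+) required")
        return None

    # int8 inputs in plain NCHW / OIHW; channel counts must be multiples of 4
    # so the template can repack them into NCHW4c / OIHW4o4i.
    A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A', dtype='int8')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8')

    with NCHWcInt8Fallback(), tvm.target.create('cuda'):
        # Dispatches to conv2d_NCHWc_int8 (packing + dp4a-tensorized schedule).
        C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding),
                           layout='NCHW', out_dtype='int32')
        s = topi.generic.schedule_conv2d_nchw([C])

    func = tvm.build(s, [A, W, C], 'cuda')
    a = tvm.nd.array(np.random.randint(-128, 128, size=get_const_tuple(A.shape)).astype('int8'), ctx)
    w = tvm.nd.array(np.random.randint(-128, 128, size=get_const_tuple(W.shape)).astype('int8'), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func(a, w, c)
    return c.asnumpy()  # 5-D NCHW4c output


if __name__ == '__main__':
    run_int8_conv2d()

Once the 'int8' template has been tuned with AutoTVM, the same topi.nn.conv2d call would typically be made under autotvm.apply_history_best(log_file) instead of the fallback context, so the tuned split/unroll knobs replace the fallback configuration.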