15 changes: 12 additions & 3 deletions nnvm/python/nnvm/top/nn.py
@@ -90,18 +90,25 @@ def compute_conv2d(attrs, inputs, _):
kernel_layout = attrs["kernel_layout"]
out_dtype = attrs["out_dtype"]
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert layout == "NCHW" or layout == "NHWC"
assert layout in ["NCHW", "NHWC", "NCHW4c"]
(dilation_h, dilation_w) = dilation
if dilation_h < 1 or dilation_w < 1:
raise ValueError("dilation should be positive value")
elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1):
raise ValueError("not support dilate now")
elif dilation == (1, 1):
kernel = inputs[1]
elif layout == "NCHW":
kernel = topi.nn.dilate(inputs[1], [1, 1, dilation_h, dilation_w])
else: #layout == NHWC
kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])

if groups == 1:
if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
# pylint: disable=assignment-from-no-return
out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding,
layout, out_dtype=out_dtype)
# pylint: enable=assignment-from-no-return
elif groups == 1:
out = topi.nn.conv2d(
inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
elif layout == "NCHW" and \
@@ -120,7 +127,7 @@ def compute_conv2d(attrs, inputs, _):

if attrs.get_bool("use_bias"):
bias = inputs[2]
expand_axis = 1 if layout == "NCHW" else 0
expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
out = topi.add(out, bias)
return out
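A hedged NumPy illustration (not part of the diff) of why expand_dims with axis=1 and two new axes covers both NCHW and the packed NCHW4c case. It assumes that for NCHW4c the bias arrives already packed as (C//4, 4); for NHWC, expanding at axis 0 gives (1, 1, C), which broadcasts over (N, H, W, C).

import numpy as np

out_nchw = np.zeros((1, 8, 4, 4), dtype=np.float32)           # (N, C, H, W)
bias = np.arange(8, dtype=np.float32)                          # (C,)
print((out_nchw + bias.reshape(8, 1, 1)).shape)                # (1, 8, 4, 4)

out_nchw4c = np.zeros((1, 2, 4, 4, 4), dtype=np.float32)       # (N, C//4, H, W, 4)
bias_packed = bias.reshape(2, 4)                               # assumed (C//4, 4) packing
print((out_nchw4c + bias_packed.reshape(2, 1, 1, 4)).shape)    # (1, 2, 4, 4, 4)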
@@ -136,6 +143,8 @@ def schedule_conv2d(attrs, outs, target):
with tvm.target.create(target):
if groups == 1 and layout == "NCHW":
return topi.generic.schedule_conv2d_nchw(outs)
elif groups == 1 and layout == "NCHW4c":
return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs)
Member Author:
@merrymercy I can't check dtype of input here. Could you comment here?

Member:
Can we get dtype from outs[0].dtype?

Member Author:
@merrymercy outs[0].dtype is a user input, which can be arbitrary
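A hedged sketch of the alternative under discussion, not the PR's resolution: instead of trusting the user-supplied outs[0].dtype, the schedule could walk back from outs[0].op to the tagged conv compute op and read the dtype of its data input. The helper name _find_conv_input_dtype is hypothetical; the tag matches the op.tag check used in the CUDA schedule below, and input_tensors[0] being the packed data tensor is an assumption.

def _find_conv_input_dtype(out_tensor, tag="conv2d_NCHWc_int8"):
    """Walk producer ops depth-first until the tagged conv op is found."""
    stack, visited = [out_tensor.op], set()
    while stack:
        op = stack.pop()
        if op in visited:
            continue
        visited.add(op)
        if op.tag == tag:
            return op.input_tensors[0].dtype   # dtype of the actual conv input
        stack.extend(t.op for t in op.input_tensors)
    return None  # conv op not found; fall back to the layout check alone

If this returned 'int8', the NCHW4c branch could be taken; otherwise the dispatch would fall through to the generic schedules.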

elif groups == 1 and layout == "NHWC":
return topi.generic.schedule_conv2d_nhwc(outs)
elif groups == channels and layout == "NCHW":
1 change: 0 additions & 1 deletion nnvm/src/top/nn/convolution.cc
@@ -344,7 +344,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
.set_support_level(2);


NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
.describe(R"code(Weight transformation of winograd fast convolution algorithm.
Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
17 changes: 13 additions & 4 deletions topi/python/topi/cuda/conv2d.py
@@ -9,9 +9,10 @@

from .conv2d_direct import schedule_direct_cuda
from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8


@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd'])
@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8'])
def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
"""Conv2D operator for cuda backend.

@@ -21,10 +22,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
The config for this template

data : tvm.Tensor
4-D with shape [batch, in_channel, in_height, in_width]
4-D with shape [batch, in_channel, in_height, in_width] or
5-D with shape [batch, ic_chunk, in_height, in_width, ic_block]

kernel : tvm.Tensor
4-D with shape [num_filter, in_channel, filter_height, filter_width]
4-D with shape [num_filter, in_channel, filter_height, filter_width] or
6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
filter_width, num_filter_block, in_channel_block]

strides : int or a list/tuple of two ints
stride size, or [stride_height, stride_width]
@@ -98,6 +102,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
if cfg.template_key == 'winograd':
return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype,
pre_computed=False)
if cfg.template_key == 'int8':
return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype,
pre_computed=False)

if layout == 'NCHW':
return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype)
@@ -108,7 +115,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):


@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"],
["direct", 'winograd'])
["direct", 'winograd', "int8"])
def schedule_conv2d_nchw_cuda(cfg, outs):
"""TOPI schedule callback of conv2d for cuda gpu

@@ -138,6 +145,8 @@ def _callback(op):
schedule_direct_cuda(cfg, s, op.output(0))
if op.tag == 'conv2d_nchw_winograd':
schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
if op.tag == "conv2d_NCHWc_int8":
schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False)

traverse_inline(s, outs[0].op, _callback)
return s
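For reference, a hedged NumPy sketch (not part of the diff) of the packed layouts described in the conv2d_cuda docstring above: NCHW data becomes the 5-D NCHW4c form [batch, ic_chunk, h, w, ic_block] and an OIHW kernel becomes the 6-D [num_filter_chunk, in_channel_chunk, kh, kw, num_filter_block, in_channel_block] form. The function names are hypothetical; block size 4 and channel counts divisible by 4 are assumed, per the "4c" in the layout name.

import numpy as np

def pack_nchw_to_nchw4c(data, ic_block=4):
    n, c, h, w = data.shape
    return data.reshape(n, c // ic_block, ic_block, h, w).transpose(0, 1, 3, 4, 2)

def pack_oihw_to_oihw4o4i(kernel, oc_block=4, ic_block=4):
    o, i, kh, kw = kernel.shape
    return (kernel.reshape(o // oc_block, oc_block, i // ic_block, ic_block, kh, kw)
                  .transpose(0, 2, 4, 5, 1, 3))

data = np.random.randint(-128, 127, size=(1, 64, 56, 56), dtype=np.int8)
kernel = np.random.randint(-128, 127, size=(64, 64, 3, 3), dtype=np.int8)
print(pack_nchw_to_nchw4c(data).shape)       # (1, 16, 56, 56, 4)
print(pack_oihw_to_oihw4o4i(kernel).shape)   # (16, 16, 3, 3, 4, 4)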
5 changes: 5 additions & 0 deletions topi/python/topi/cuda/conv2d_direct.py
@@ -2,6 +2,7 @@
"""The templates for cuda conv2d operators"""
import tvm
from tvm import autotvm
from ..util import get_const_tuple

def schedule_direct_cuda(cfg, s, conv):
"""schedule optimized for batch size = 1"""
@@ -94,3 +95,7 @@ def schedule_direct_cuda(cfg, s, conv):
# unroll
s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

N, CO, OH, OW = get_const_tuple(output.shape)
_, KH, KW, CI = get_const_tuple(kernel.shape)
cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
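A worked instance of the FLOP count registered above, with hypothetical shapes: each output element costs CI*KH*KW multiply-accumulates, counted as 2 FLOPs each.

N, CO, OH, OW = 1, 64, 56, 56      # output: batch, out channels, spatial size
CI, KH, KW = 64, 3, 3              # reduction: in channels, kernel size
print(2 * N * OH * OW * CO * CI * KH * KW)   # 231211008, about 0.23 GFLOP per call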