From 2bab6c8a856d2d8363ceaf60f3aba05baabe6ead Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 14 Sep 2018 15:39:59 +0800 Subject: [PATCH 01/16] Add int8 conv2d NCHWc template --- topi/python/topi/cuda/conv2d.py | 17 +- topi/python/topi/cuda/conv2d_int8.py | 279 +++++++++++++++++++++++++++ 2 files changed, 292 insertions(+), 4 deletions(-) create mode 100644 topi/python/topi/cuda/conv2d_int8.py diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index a7d5f742d98c..980be6e634d5 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -9,9 +9,10 @@ from .conv2d_direct import schedule_direct_cuda from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda +from .conv2d_int8 import conv2d_int8_NCHWc, schedule_conv2d_int8_NCHWc -@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd']) +@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8']) def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'): """Conv2D operator for cuda backend. @@ -21,10 +22,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f The config for this template data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] + 4-D with shape [batch, in_channel, in_height, in_width] or + 5-D with shape [batch, ic_chunk, in_height, in_width, ic_block] kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -98,7 +102,10 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f if cfg.template_key == 'winograd': return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed=False) + if cfg.template_key == 'int8': + return conv2d_int8_NCHWc(cfg, data, kernel, strides, padding, layout, out_dtype) + print(cfg.template_key) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype) elif layout == 'HWCN': @@ -108,7 +115,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f @autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"], - ["direct", 'winograd']) + ["direct", 'winograd', "int8"]) def schedule_conv2d_nchw_cuda(cfg, outs): """TOPI schedule callback of conv2d for cuda gpu @@ -138,6 +145,8 @@ def _callback(op): schedule_direct_cuda(cfg, s, op.output(0)) if op.tag == 'conv2d_nchw_winograd': schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) + if op.tag == "conv2d_int8_NCHWc": + schedule_conv2d_int8_NCHWc(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py new file mode 100644 index 000000000000..43626c60b78c --- /dev/null +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -0,0 +1,279 @@ +# pylint: disable=invalid-name +"""Int8 conv2d in NCHWc layout""" +import tvm +from tvm import autotvm + +from .tensor_intrin import dp4a +from ..nn.pad import pad +from ..nn.util import get_pad_tuple +from ..util import get_const_tuple, get_const_int + + +def _conv2d_int8_NCHWc_arg_to_workload(data, kernel, stride, padding, layout, out_dtype): + """convert argument to workload""" + 
shape = get_const_tuple(data.shape) + if len(shape) == 5: + N, ic_chunk, H, W, ic_block = shape + raw_data = tvm.placeholder( + (N, ic_chunk*ic_block, H, W), dtype=data.dtype) + else: + raw_data = data + + shape = get_const_tuple(kernel.shape) + if len(shape) == 6: + oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape + raw_kernel = tvm.placeholder( + (oc_chunk, ic_chunk, KH, KW, oc_block, ic_block), dtype=kernel.dtype) + else: + raw_kernel = kernel + + return ('conv2d', ) + autotvm.task.task.args_to_workload( + [raw_data, raw_kernel, stride, padding, out_dtype]) + + +def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): + """Convolution operator in NCHW[x]c layout for int8. + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] or + 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] + + kernel : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. + + Returns + ------- + output : tvm.Tensor + 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] + """ + assert layout == 'NCHW' + + ic_block_factor = 4 + oc_block_factor = 4 + + if len(data.shape) == 4: + batch, channels, height, width = get_const_tuple(data.shape) + assert channels % ic_block_factor == 0, \ + "Number of input channels should be multiple of {}".format( + ic_block_factor) + packed_data = tvm.compute((batch, channels/ic_block_factor, height, width, ic_block_factor), + lambda n, c, h, w, vc: kernel[n, + c*ic_block_factor + vc, h, w], + name="packed_data") + else: + packed_data = data + + if len(kernel.shape) == 4: + out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( + kernel.shape) + assert out_channels % 4 == 0, \ + "Number of output channels should be multiple of {}".format( + oc_block_factor) + packed_kernel = tvm.compute( + (out_channels / oc_block_factor, in_channels / ic_block_factor, kernel_h, kernel_w, + oc_block_factor, ic_block_factor), + lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: + kernel[oc_chunk * oc_block_factor + oc_block, + ic_chunk * ic_block_factor + ic_block, kh, kw], + name="packed_kernel") + else: + packed_kernel = kernel + + batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple( + packed_data.shape) + oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple( + packed_kernel.shape) + + stride_h, stride_w = (stride, stride) if isinstance( + stride, int) else stride + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (kernel_h, kernel_w)) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") + + # compute the output shape + out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1 + + oshape = (batch, oc_chunk, out_height, out_width, oc_block) + + icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk') + icb = 
tvm.reduce_axis((0, ic_block), name='ic_block') + kh = tvm.reduce_axis((0, kernel_h), name='kh') + kw = tvm.reduce_axis((0, kernel_w), name='kw') + + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(pad_data[n, icc, oh*stride_h+kh, ow*stride_w+kw, icb] + .astype('int32') * + packed_kernel[oc_chunk, icc, + kh, kw, oc_block, icb] + .astype('int32'), + axis=[icc, kh, kw, icb])) + + output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype), + tag="conv2d_int8_NCHWc", + attrs={"workload": _conv2d_int8_NCHWc_arg_to_workload( + data, kernel, stride, padding, layout, out_dtype)}) + + # num flop + num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ + ic_chunk * ic_block * kernel_h * kernel_w * 2 + cfg.add_flop(num_flop) + + return output + + +_dp4a = dp4a('shared', 'shared', 'local') + + +def schedule_conv2d_int8_NCHWc(cfg, s, output): + """Schedule conv2d int8 NCHWc template""" + workload = output.op.attrs["workload"] + + stride = workload[3] + + conv = output.op.input_tensors[0] + packed_data, packed_kernel = conv.op.input_tensors + + if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag: + pad_data = packed_data + packed_data = pad_data.op.input_tensors[0] + else: + pad_data = packed_data + + if autotvm.GLOBAL_SCOPE.in_tuning: + # skip this part during tuning to make recrods accurate + # this part will be pre-computed during NNVM's pre-compute optimization pass + s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") + s[packed_kernel].pragma( + s[packed_kernel].op.axis[0], "debug_skip_region") + else: + if isinstance(packed_data.op, tvm.tensor.ComputeOp): + s[packed_data].compute_inline() + if isinstance(packed_kernel.op, tvm.tensor.ComputeOp): + s[packed_kernel].compute_inline() + + if pad_data != packed_data: + s[pad_data].compute_inline() + + s[conv].set_scope('local') + + batch = get_const_int(packed_data.shape[0]) + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + # create cache stage + AA = s.cache_read(pad_data, 'shared', [conv]) + WW = s.cache_read(packed_kernel, 'shared', [conv]) + + # tile and bind spatial axes + n, f, y, x, c = s[output].op.axis + cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) + cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) + cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) + + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + # this is the scope to attach global config inside this kernel + kernel_scope, n = s[output].split(n, nparts=1) + + max_block_z = 128 + if batch > max_block_z: + _, n = s[output].split(n, factor=max_block_z) + s[output].bind(n, tvm.thread_axis("blockIdx.z")) + + s[output].bind(bf, tvm.thread_axis("blockIdx.y")) + s[output].bind(bx, tvm.thread_axis("blockIdx.x")) + s[output].bind(vf, tvm.thread_axis("vthread")) + s[output].bind(vy, tvm.thread_axis("vthread")) + s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(tf, tvm.thread_axis("threadIdx.z")) + s[output].bind(ty, tvm.thread_axis("threadIdx.y")) + s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + + s[conv].compute_at(s[output], tx) + + # tile and bind reduction axes + n, f, y, x, c = s[conv].op.axis + + rc, ry, rx, rc_block = s[conv].op.reduce_axis + 
cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) + cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) + cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) + rco, rci = cfg['tile_rc'].apply(s, conv, rc) + ryo, ryi = cfg['tile_ry'].apply(s, conv, ry) + rxo, rxi = cfg['tile_rx'].apply(s, conv, rx) + + s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) + + _, rc_block = s[conv].split(rc_block, factor=4) + s[conv].tensorize(rc_block, _dp4a) + + s[AA].compute_at(s[conv], rxo) + s[WW].compute_at(s[conv], rxo) + + # cooperative fetching + for load in [AA, WW]: + if load == AA: + n, f, y, x, c = s[load].op.axis + if pad_data != packed_data and stride_h == 1 and stride_w == 1: + s[load].vectorize(c) + fused = s[load].fuse(n, f, y, x) + else: + c, _ = s[load].split(c, factor=4) + fused = s[load].fuse(n, f, y, x, c) + else: + n, f, y, x, oc_chunk, c = s[load].op.axis + fused = s[load].fuse(n, f, y, x, oc_chunk) + s[load].vectorize(c) + + fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2]) + fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2]) + fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2]) + s[load].bind(tz, tvm.thread_axis("threadIdx.z")) + s[load].bind(ty, tvm.thread_axis("threadIdx.y")) + s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + + # double buffer + for load in [AA, WW]: + name = load.op.name + '_double_buffer' + cfg.define_knob(name, [0, 1]) + + if cfg[name].val: + s[load].double_buffer() + + # unroll + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + s[output].pragma(kernel_scope, 'auto_unroll_max_step', + cfg['auto_unroll_max_step'].val) + s[output].pragma(kernel_scope, 'unroll_explicit', False) + + return s From 2fd0a15f3081e651764e7ca6b4d3cd9aa8f4f32d Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 19 Sep 2018 18:51:28 +0800 Subject: [PATCH 02/16] Add NNVM symbol of conv2d NCHWc int8 --- nnvm/python/nnvm/top/nn.py | 31 +++++++ nnvm/src/top/nn/convolution.cc | 16 ++++ topi/python/topi/cuda/conv2d.py | 10 +-- topi/python/topi/cuda/conv2d_int8.py | 104 ++++++++++++++++------- topi/python/topi/cuda/conv2d_winograd.py | 6 ++ topi/python/topi/generic/nn.py | 18 ++++ topi/python/topi/nn/conv2d.py | 34 ++++++++ 7 files changed, 184 insertions(+), 35 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index b452738123c3..cee0d7741977 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -196,7 +196,38 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target): reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) +@reg.register_compute("_contrib_conv2d_NCHWc_int8_prepacked") +def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): + """Compute definition of int8 conv2d NCHWc""" + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.get_string("layout") + out_dtype = attrs.get_string("out_dtype") + out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype + assert dilation == (1, 1), "not support dilate now" + if groups == 1: + # pylint: disable=assignment-from-no-return + out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, out_dtype) + # pylint: enable=assignment-from-no-return + else: + raise ValueError("not support arbitrary group number > 1 for now") + if attrs.get_bool("use_bias"): + bias = inputs[2] + bias = topi.expand_dims(bias, axis=1, 
num_newaxis=2) + out = topi.add(out, bias) + return out + +@reg.register_schedule("_contrib_conv2d_NCHWc_int8_prepacked") +def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): + with tvm.target.create(target): + return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs) + +reg.register_pattern("_contrib_conv2d_NCHWc_int8", OpPattern.OUT_ELEMWISE_FUSABLE) + + @reg.register_compute("_contrib_conv2d_winograd_weight_transform") def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _): return topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size')) diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index d5c9c18f68a6..0a6e2d09b73f 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -344,6 +344,22 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc) .set_num_inputs(UseBiasNumInputs) .set_support_level(2); +NNVM_REGISTER_OP(_contrib_conv2d_NCHWc_int8_prepacked) +.describe(R"code(2D convolution layer in int8 using prepacked data and kernel. +)code" NNVM_ADD_FILELINE) +.add_argument("data", "5D Tensor", "Packed input data.") +.add_argument("weight", "6D Tensor", "Packed weight matrix.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", Conv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(2); NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform) .describe(R"code(Weight transformation of winograd fast convolution algorithm. diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 980be6e634d5..450e4e3b442e 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -9,7 +9,7 @@ from .conv2d_direct import schedule_direct_cuda from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda -from .conv2d_int8 import conv2d_int8_NCHWc, schedule_conv2d_int8_NCHWc +from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8 @autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8']) @@ -103,9 +103,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype, pre_computed=False) if cfg.template_key == 'int8': - return conv2d_int8_NCHWc(cfg, data, kernel, strides, padding, layout, out_dtype) + return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype, + pre_computed=False) - print(cfg.template_key) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype) elif layout == 'HWCN': @@ -145,8 +145,8 @@ def _callback(op): schedule_direct_cuda(cfg, s, op.output(0)) if op.tag == 'conv2d_nchw_winograd': schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) - if op.tag == "conv2d_int8_NCHWc": - schedule_conv2d_int8_NCHWc(cfg, s, op.output(0)) + if op.tag == "conv2d_NCHWc_int8": + schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 43626c60b78c..de67ef25a3a3 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -3,13 +3,15 @@ 
import tvm from tvm import autotvm +from ..generic import schedule_conv2d_NCHWc_int8_prepacked from .tensor_intrin import dp4a +from ..nn.conv2d import conv2d_NCHWc_int8_prepacked from ..nn.pad import pad from ..nn.util import get_pad_tuple -from ..util import get_const_tuple, get_const_int +from ..util import get_const_tuple, get_const_int, traverse_inline -def _conv2d_int8_NCHWc_arg_to_workload(data, kernel, stride, padding, layout, out_dtype): +def _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype): """convert argument to workload""" shape = get_const_tuple(data.shape) if len(shape) == 5: @@ -23,15 +25,15 @@ def _conv2d_int8_NCHWc_arg_to_workload(data, kernel, stride, padding, layout, ou if len(shape) == 6: oc_chunk, ic_chunk, KH, KW, oc_block, ic_block = shape raw_kernel = tvm.placeholder( - (oc_chunk, ic_chunk, KH, KW, oc_block, ic_block), dtype=kernel.dtype) + (oc_chunk*oc_block, ic_chunk*ic_block, KH, KW), dtype=kernel.dtype) else: raw_kernel = kernel return ('conv2d', ) + autotvm.task.task.args_to_workload( - [raw_data, raw_kernel, stride, padding, out_dtype]) + [raw_data, raw_kernel, stride, padding, "NCHW", out_dtype]) -def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): +def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre_computed): """Convolution operator in NCHW[x]c layout for int8. Parameters @@ -57,20 +59,23 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): layout : str layout of data - out_dtype: str + out_dtype : str The output type. This is used for mixed precision. + pre_computed : str + Whether packed data and kernel are pre-computed + Returns ------- output : tvm.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - assert layout == 'NCHW' + assert layout == "NCHW" or layout == "NCHW4c" ic_block_factor = 4 oc_block_factor = 4 - if len(data.shape) == 4: + if not pre_computed: batch, channels, height, width = get_const_tuple(data.shape) assert channels % ic_block_factor == 0, \ "Number of input channels should be multiple of {}".format( @@ -79,10 +84,7 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): lambda n, c, h, w, vc: kernel[n, c*ic_block_factor + vc, h, w], name="packed_data") - else: - packed_data = data - if len(kernel.shape) == 4: out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( kernel.shape) assert out_channels % 4 == 0, \ @@ -95,7 +97,9 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): kernel[oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw], name="packed_kernel") + else: + packed_data = data packed_kernel = kernel batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple( @@ -134,9 +138,9 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype), - tag="conv2d_int8_NCHWc", - attrs={"workload": _conv2d_int8_NCHWc_arg_to_workload( - data, kernel, stride, padding, layout, out_dtype)}) + tag="conv2d_NCHWc_int8", + attrs={"workload": _conv2d_NCHWc_int8_arg_to_workload( + data, kernel, stride, padding, out_dtype)}) # num flop num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ @@ -149,7 +153,7 @@ def conv2d_int8_NCHWc(cfg, data, kernel, stride, padding, layout, out_dtype): _dp4a = dp4a('shared', 'shared', 'local') -def 
schedule_conv2d_int8_NCHWc(cfg, s, output): +def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): """Schedule conv2d int8 NCHWc template""" workload = output.op.attrs["workload"] @@ -164,14 +168,14 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): else: pad_data = packed_data - if autotvm.GLOBAL_SCOPE.in_tuning: - # skip this part during tuning to make recrods accurate - # this part will be pre-computed during NNVM's pre-compute optimization pass - s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") - s[packed_kernel].pragma( - s[packed_kernel].op.axis[0], "debug_skip_region") - else: - if isinstance(packed_data.op, tvm.tensor.ComputeOp): + if not pre_computed: + if autotvm.GLOBAL_SCOPE.in_tuning: + # skip this part during tuning to make recrods accurate + # this part will be pre-computed during NNVM's pre-compute optimization pass + s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") + s[packed_kernel].pragma( + s[packed_kernel].op.axis[0], "debug_skip_region") + else: s[packed_data].compute_inline() if isinstance(packed_kernel.op, tvm.tensor.ComputeOp): s[packed_kernel].compute_inline() @@ -244,7 +248,7 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): for load in [AA, WW]: if load == AA: n, f, y, x, c = s[load].op.axis - if pad_data != packed_data and stride_h == 1 and stride_w == 1: + if pad_data == packed_data and stride_h == 1 and stride_w == 1: s[load].vectorize(c) fused = s[load].fuse(n, f, y, x) else: @@ -263,12 +267,12 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): s[load].bind(tx, tvm.thread_axis("threadIdx.x")) # double buffer - for load in [AA, WW]: - name = load.op.name + '_double_buffer' - cfg.define_knob(name, [0, 1]) - - if cfg[name].val: - s[load].double_buffer() + cfg.define_knob('AA_double_buffer', [0, 1]) + cfg.define_knob('WW_double_buffer', [0, 1]) + if cfg['AA_double_buffer'].val: + s[AA].double_buffer() + if cfg['WW_double_buffer'].val: + s[WW].double_buffer() # unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) @@ -277,3 +281,43 @@ def schedule_conv2d_int8_NCHWc(cfg, s, output): s[output].pragma(kernel_scope, 'unroll_explicit', False) return s + + +@conv2d_NCHWc_int8_prepacked.register(["cuda", "gpu"]) +@autotvm.task.dispatcher +def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype): + assert layout == 'NCHW4c' + return _conv2d_NCHWc_int8_arg_to_workload(data, kernel, stride, padding, out_dtype) + + +@conv2d_NCHWc_int8_prepacked_dispatcher.register("int8") +def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype): + return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, + pre_computed=True) + +@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda", "gpu"], ["int8"]) +def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs): + """TOPI schedule callback of conv2d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d. 
+ """ + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv2d_NCHWc_int8' in op.tag: + schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=True) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 7e0574ea606b..f2b6d960135c 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -375,6 +375,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): if cfg.template_key == 'direct': return None + if cfg.template_key == 'int8': + new_attrs['layout'] = 'NCHW4c' + new_attrs['out_layout'] = 'NCHW4c' + new_attrs['kernel_layout'] = 'OIHW4o4i' + return sym.contrib.conv2d_NCHWc_int8_prepacked(*copy_inputs, **new_attrs) + # pre-compute weight transformation in winograd tile_size = _infer_tile_size(tinfos[0], tinfos[1]) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 874decc792ec..e99ce263296b 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -139,6 +139,24 @@ def schedule_conv2d_winograd_without_weight_transform(outs): return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_conv2d_NCHWc_int8_prepacked(outs): + """Schedule for conv2d NCHWc int8 with prepacked data and kernel + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of this operator + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + @tvm.target.generic_func def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index e0d2c403d4b4..e389b1eb4bfc 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -397,3 +397,37 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, 4-D with shape [batch, out_height, out_width, out_channel] """ raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") + + +@tvm.target.generic_func +def conv2d_NCHWc_int8_prepacked(data, kernel, stride, padding, layout, out_dtype): + """Convolution operator in NCHW[x]c layout for int8. Data and kernel should be packed in + advance. + + Parameters + ---------- + data : tvm.Tensor + 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] + + kernel : tvm.Tensor + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding: int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. 
+ + Returns + ------- + output : tvm.Tensor + 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] + """ + raise ValueError("missing register for topi.nn.conv2d_NCHWc_int8_prepacked") From 35df666aec30ebbe7169bb19240a9ec1ee3fec4a Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 20 Sep 2018 10:38:33 +0800 Subject: [PATCH 03/16] Fix computing packed_kernel --- topi/python/topi/cuda/conv2d_int8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index de67ef25a3a3..4d666ca96e4b 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -177,7 +177,6 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): s[packed_kernel].op.axis[0], "debug_skip_region") else: s[packed_data].compute_inline() - if isinstance(packed_kernel.op, tvm.tensor.ComputeOp): s[packed_kernel].compute_inline() if pad_data != packed_data: From 5a2e6ff81a90f835a57a8200fccc187344c50974 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 20 Sep 2018 10:44:38 +0800 Subject: [PATCH 04/16] Fix style --- nnvm/python/nnvm/top/nn.py | 7 ++++--- topi/python/topi/cuda/conv2d.py | 2 +- topi/python/topi/cuda/conv2d_int8.py | 15 ++++++++------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index cee0d7741977..bdf949ff4d87 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -209,7 +209,8 @@ def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): assert dilation == (1, 1), "not support dilate now" if groups == 1: # pylint: disable=assignment-from-no-return - out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, out_dtype) + out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, + out_dtype) # pylint: enable=assignment-from-no-return else: raise ValueError("not support arbitrary group number > 1 for now") @@ -218,7 +219,7 @@ def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): bias = topi.expand_dims(bias, axis=1, num_newaxis=2) out = topi.add(out, bias) return out - + @reg.register_schedule("_contrib_conv2d_NCHWc_int8_prepacked") def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): with tvm.target.create(target): @@ -227,7 +228,7 @@ def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): reg.register_pattern("_contrib_conv2d_NCHWc_int8", OpPattern.OUT_ELEMWISE_FUSABLE) - + @reg.register_compute("_contrib_conv2d_winograd_weight_transform") def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _): return topi.nn.conv2d_winograd_weight_transform(inputs[0], attrs.get_int('tile_size')) diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 450e4e3b442e..4dac40746419 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -104,7 +104,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='f pre_computed=False) if cfg.template_key == 'int8': return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype, - pre_computed=False) + pre_computed=False) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 4d666ca96e4b..5dcc8a52e209 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -70,7 +70,7 @@ def 
conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre output : tvm.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - assert layout == "NCHW" or layout == "NCHW4c" + assert layout in ["NCHW", "NCHW4c"] ic_block_factor = 4 oc_block_factor = 4 @@ -80,7 +80,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre assert channels % ic_block_factor == 0, \ "Number of input channels should be multiple of {}".format( ic_block_factor) - packed_data = tvm.compute((batch, channels/ic_block_factor, height, width, ic_block_factor), + packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, + ic_block_factor), lambda n, c, h, w, vc: kernel[n, c*ic_block_factor + vc, h, w], name="packed_data") @@ -91,7 +92,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre "Number of output channels should be multiple of {}".format( oc_block_factor) packed_kernel = tvm.compute( - (out_channels / oc_block_factor, in_channels / ic_block_factor, kernel_h, kernel_w, + (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w, oc_block_factor, ic_block_factor), lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[oc_chunk * oc_block_factor + oc_block, @@ -282,7 +283,7 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): return s -@conv2d_NCHWc_int8_prepacked.register(["cuda", "gpu"]) +@conv2d_NCHWc_int8_prepacked.register(["cuda"]) @autotvm.task.dispatcher def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout, out_dtype): assert layout == 'NCHW4c' @@ -292,11 +293,11 @@ def conv2d_NCHWc_int8_prepacked_dispatcher(data, kernel, stride, padding, layout @conv2d_NCHWc_int8_prepacked_dispatcher.register("int8") def _decl_conv2d_NCHWc_int8_prepacked(cfg, data, kernel, stride, padding, layout, out_dtype): return conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, - pre_computed=True) + pre_computed=True) -@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda", "gpu"], ["int8"]) +@autotvm.register_topi_schedule(schedule_conv2d_NCHWc_int8_prepacked, ["cuda"], ["int8"]) def schedule_conv2d_NCHWc_int8_prepacked_cuda(cfg, outs): - """TOPI schedule callback of conv2d for cuda gpu + """TOPI schedule callback of conv2d for cuda Parameters ---------- From 74bced573a46754c41a282733e06d8ffa89899e8 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 20 Sep 2018 13:50:04 +0800 Subject: [PATCH 05/16] Compute flop in conv2d_direct manually to support int8 --- topi/python/topi/cuda/conv2d_direct.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py index 19e7ea38f647..9b315a6b0fc1 100644 --- a/topi/python/topi/cuda/conv2d_direct.py +++ b/topi/python/topi/cuda/conv2d_direct.py @@ -2,6 +2,7 @@ """The templates for cuda conv2d operators""" import tvm from tvm import autotvm +from ..util import get_const_tuple def schedule_direct_cuda(cfg, s, conv): """schedule optimized for batch size = 1""" @@ -94,3 +95,7 @@ def schedule_direct_cuda(cfg, s, conv): # unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + N, CO, OH, OW = get_const_tuple(output.shape) + _, KH, KW, CI = get_const_tuple(kernel.shape) + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) From 
b673f008de4c654d3ca583d43d44aa4ad1a55cd5 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 10:41:25 +0800 Subject: [PATCH 06/16] Handle bias in conv2d_int8 schedule --- topi/python/topi/cuda/conv2d_int8.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 5dcc8a52e209..d6da1eb1e1cf 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -183,8 +183,6 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): if pad_data != packed_data: s[pad_data].compute_inline() - s[conv].set_scope('local') - batch = get_const_int(packed_data.shape[0]) if isinstance(stride, int): stride_h = stride_w = stride @@ -195,6 +193,13 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): AA = s.cache_read(pad_data, 'shared', [conv]) WW = s.cache_read(packed_kernel, 'shared', [conv]) + s[conv].set_scope('local') + + # handle bias + if output.op not in s.outputs: + s[output].compute_inline() + output = s.outputs[0].output(0) + # tile and bind spatial axes n, f, y, x, c = s[output].op.axis cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) From c48e1fb40349693fd9a70204c0f3aed7f274d8cd Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 11:33:57 +0800 Subject: [PATCH 07/16] Fix wrong variable --- topi/python/topi/cuda/conv2d_int8.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index d6da1eb1e1cf..a1b73691d5cc 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -82,8 +82,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre ic_block_factor) packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, ic_block_factor), - lambda n, c, h, w, vc: kernel[n, - c*ic_block_factor + vc, h, w], + lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w], name="packed_data") out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( From a037241b0ba7710b07eb11368ba7d5d6a6d7032c Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 14:02:24 +0800 Subject: [PATCH 08/16] Improve style --- topi/python/topi/cuda/conv2d_int8.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index a1b73691d5cc..03bbfc0f876b 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -107,8 +107,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, layout, out_dtype, pre oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple( packed_kernel.shape) - stride_h, stride_w = (stride, stride) if isinstance( - stride, int) else stride + if isinstance(stride, int): + stride_h, stride_w = stride + else: + stride_h, stride_w = stride pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (kernel_h, kernel_w)) From d685e0d1419e1bf6b0fe0ed8db2b8b14f9349edc Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 18:48:28 +0800 Subject: [PATCH 09/16] Handle dilate and packing schedule correctly --- topi/python/topi/cuda/conv2d_int8.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 03bbfc0f876b..4e1b2672bed2 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ 
b/topi/python/topi/cuda/conv2d_int8.py @@ -3,6 +3,7 @@ import tvm from tvm import autotvm +from .injective import _schedule_injective from ..generic import schedule_conv2d_NCHWc_int8_prepacked from .tensor_intrin import dp4a from ..nn.conv2d import conv2d_NCHWc_int8_prepacked @@ -171,6 +172,7 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): pad_data = packed_data if not pre_computed: + kernel, = packed_kernel.op.input_tensors if autotvm.GLOBAL_SCOPE.in_tuning: # skip this part during tuning to make recrods accurate # this part will be pre-computed during NNVM's pre-compute optimization pass @@ -178,8 +180,13 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): s[packed_kernel].pragma( s[packed_kernel].op.axis[0], "debug_skip_region") else: - s[packed_data].compute_inline() - s[packed_kernel].compute_inline() + _schedule_injective(packed_data.op, s) + _schedule_injective(packed_kernel.op, s) + else: + kernel = packed_data + + if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() if pad_data != packed_data: s[pad_data].compute_inline() From 0809b0ea30b3fd74aa06ee464ed1bc044556c8be Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 21 Sep 2018 19:23:58 +0800 Subject: [PATCH 10/16] Add unittests of conv2d_NCHWc_int8 --- topi/tests/python/test_topi_conv2d_int8.py | 173 +++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 topi/tests/python/test_topi_conv2d_int8.py diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py new file mode 100644 index 000000000000..bb441891155f --- /dev/null +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -0,0 +1,173 @@ +"""Example code to do convolution.""" + +import numpy as np +import tvm +from tvm import autotvm +from tvm.autotvm.task.space import FallbackConfigEntity +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple + +from common import get_all_backend + +oc_block_factor = 4 + + +def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): + print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) + + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') + bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', + dtype='int8') + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + def get_ref_data(): + a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) + w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) + + # convert to NCHWc + _, _, out_height, out_width = c_np.shape + c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \ + out_height, out_width)).transpose(0, 1, 3, 4, 2) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = 
np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + dW = topi.nn.dilate(W, (1, 1, dilation, dilation)) + C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding), + layout='NCHW', out_dtype=dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = topi.generic.schedule_conv2d_nchw([C]) + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + if add_bias: + tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func(a, w, b, c) + else: + func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func(a, w, c) + np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in ["cuda"]: + check_device(device) + + +class NCHWcInt8Fallback(autotvm.FallbackContext): + def _query_inside(self, target, workload): + key = (target, workload) + if key in self.memory: + return self.memory[key] + cfg = FallbackConfigEntity() + cfg.template_key = 'int8' + self.memory[key] = cfg + return cfg + + +def test_conv2d_nchw(): + with NCHWcInt8Fallback(): + # ResNet18 workloads where channels in / out are multiple of oc_block_factor + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 64, 56, 128, 3, 2, 1) + verify_conv2d_NCHWc_int8(1, 64, 56, 128, 1, 2, 0) + verify_conv2d_NCHWc_int8(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 128, 28, 256, 3, 2, 1) + verify_conv2d_NCHWc_int8(1, 128, 28, 256, 1, 2, 0) + verify_conv2d_NCHWc_int8(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 256, 14, 512, 3, 2, 1) + verify_conv2d_NCHWc_int8(1, 256, 14, 512, 1, 2, 0) + verify_conv2d_NCHWc_int8(1, 512, 7, 512, 3, 1, 1) + + # bias, relu + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_relu=True) + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True) + verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True) + + # batch size + verify_conv2d_NCHWc_int8(4, 64, 56, 64, 3, 1, 1) + verify_conv2d_NCHWc_int8(9, 64, 56, 64, 3, 1, 1) + + # weird workloads + verify_conv2d_NCHWc_int8(4, 4, 4, 4, 4, 4, 4) + + # inception v3 workloads where channels in / out are multiple of oc_block_factor + verify_conv2d_NCHWc_int8(1, 32, 149, 32, 3, 1, 0) + verify_conv2d_NCHWc_int8(1, 32, 147, 64, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 64, 73, 80, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 80, 73, 192, 3, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 35, 64, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 35, 48, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 48, 35, 64, 5, 1, 2) + verify_conv2d_NCHWc_int8(1, 64, 35, 96, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 96, 35, 96, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 192, 35, 32, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 256, 35, 64, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 256, 35, 48, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 288, 35, 64, 1, 
1, 0) + verify_conv2d_NCHWc_int8(1, 288, 35, 48, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 288, 35, 384, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 96, 35, 96, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 768, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 768, 17, 128, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 128, 17, 128, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 128, 17, 192, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 128, 17, 128, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 128, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 768, 17, 160, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 160, 17, 160, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 160, 17, 192, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 160, 17, 160, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 160, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 17, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 192, 17, 192, 7, 1, 3) + verify_conv2d_NCHWc_int8(1, 192, 17, 320, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 192, 17, 192, 3, 2, 0) + verify_conv2d_NCHWc_int8(1, 1280, 8, 320, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 1280, 8, 384, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 384, 8, 384, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 384, 8, 384, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 1280, 8, 448, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 448, 8, 384, 3, 1, 1) + verify_conv2d_NCHWc_int8(1, 1280, 8, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 320, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 384, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 448, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 2048, 8, 192, 1, 1, 0) + verify_conv2d_NCHWc_int8(1, 1024, 19, 84, 3, 1, 1) + + +if __name__ == "__main__": + test_conv2d_nchw() From 9553a63aa5d1e56257c555427b7156c80328102d Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 25 Sep 2018 13:37:42 +0800 Subject: [PATCH 11/16] Add device compatability check in unittest --- topi/tests/python/test_topi_conv2d_int8.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py index bb441891155f..2b85b2b97cb1 100644 --- a/topi/tests/python/test_topi_conv2d_int8.py +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -21,7 +21,7 @@ def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, str A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') - bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', + bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', dtype='int8') a_shape = get_const_tuple(A.shape) @@ -57,6 +57,10 @@ def check_device(device): if not ctx.exist: print("Skip because %s is not enabled" % device) return + if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + print("Skip because int8 intrinsics are not available") + return + print("Running on target: %s" % device) with tvm.target.create(device): dW = topi.nn.dilate(W, (1, 1, dilation, dilation)) From ecad8bf05e80804218f8ef02bbc5c4337d247783 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 25 Sep 2018 16:06:01 +0800 Subject: [PATCH 12/16] Use sym.conv2d instead of new symbol --- nnvm/python/nnvm/top/nn.py | 47 ++++++------------------ nnvm/src/top/nn/convolution.cc | 17 --------- topi/python/topi/cuda/conv2d_winograd.py | 2 +- 3 files changed, 13 insertions(+), 53 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index 
bdf949ff4d87..49192cacd713 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -90,10 +90,12 @@ def compute_conv2d(attrs, inputs, _): kernel_layout = attrs["kernel_layout"] out_dtype = attrs["out_dtype"] out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype - assert layout == "NCHW" or layout == "NHWC" + assert layout in ["NCHW", "NHWC", "NCHW4c"] (dilation_h, dilation_w) = dilation if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") + elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1): + raise ValueError("not support dilate now") elif dilation == (1, 1): kernel = inputs[1] elif layout == "NCHW": @@ -101,7 +103,12 @@ def compute_conv2d(attrs, inputs, _): else: #layout == NHWC kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1]) - if groups == 1: + if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8': + # pylint: disable=assignment-from-no-return + out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding, + layout, out_dtype=out_dtype) + # pylint: enable=assignment-from-no-return + elif groups == 1: out = topi.nn.conv2d( inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype) elif layout == "NCHW" and \ @@ -120,7 +127,7 @@ def compute_conv2d(attrs, inputs, _): if attrs.get_bool("use_bias"): bias = inputs[2] - expand_axis = 1 if layout == "NCHW" else 0 + expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0 bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2) out = topi.add(out, bias) return out @@ -136,6 +143,8 @@ def schedule_conv2d(attrs, outs, target): with tvm.target.create(target): if groups == 1 and layout == "NCHW": return topi.generic.schedule_conv2d_nchw(outs) + elif groups == 1 and layout == "NCHW4c": + return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs) elif groups == 1 and layout == "NHWC": return topi.generic.schedule_conv2d_nhwc(outs) elif groups == channels and layout == "NCHW": @@ -196,38 +205,6 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target): reg.register_pattern("_contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_compute("_contrib_conv2d_NCHWc_int8_prepacked") -def compute_contrib_conv2d_NCHWc_int8_prepacked(attrs, inputs, _): - """Compute definition of int8 conv2d NCHWc""" - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs.get_string("layout") - out_dtype = attrs.get_string("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype - assert dilation == (1, 1), "not support dilate now" - if groups == 1: - # pylint: disable=assignment-from-no-return - out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], inputs[1], strides, padding, layout, - out_dtype) - # pylint: enable=assignment-from-no-return - else: - raise ValueError("not support arbitrary group number > 1 for now") - if attrs.get_bool("use_bias"): - bias = inputs[2] - bias = topi.expand_dims(bias, axis=1, num_newaxis=2) - out = topi.add(out, bias) - return out - -@reg.register_schedule("_contrib_conv2d_NCHWc_int8_prepacked") -def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): - with tvm.target.create(target): - return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs) - -reg.register_pattern("_contrib_conv2d_NCHWc_int8", OpPattern.OUT_ELEMWISE_FUSABLE) - - @reg.register_compute("_contrib_conv2d_winograd_weight_transform") def 
compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _): diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index 0a6e2d09b73f..22bda048a0a2 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -344,23 +344,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc) .set_num_inputs(UseBiasNumInputs) .set_support_level(2); -NNVM_REGISTER_OP(_contrib_conv2d_NCHWc_int8_prepacked) -.describe(R"code(2D convolution layer in int8 using prepacked data and kernel. -)code" NNVM_ADD_FILELINE) -.add_argument("data", "5D Tensor", "Packed input data.") -.add_argument("weight", "6D Tensor", "Packed weight matrix.") -.add_argument("bias", "1D Tensor", "Bias parameter.") -.add_arguments(Conv2DParam::__FIELDS__()) -.set_attr_parser(ParamParser) -.set_attr("FGetAttrDict", ParamGetAttrDict) -.set_attr("FListInputNames", UseBiasListInputNames) -.set_attr("FInferShape", Conv2DInferShape) -.set_attr("FInferType", Conv2DInferType) -.set_attr("FCorrectLayout", Conv2DCorrectLayout) -.set_num_outputs(1) -.set_num_inputs(UseBiasNumInputs) -.set_support_level(2); - NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform) .describe(R"code(Weight transformation of winograd fast convolution algorithm. Separate this into another nnvm symbol in order to enable Precompute Pass to compute the diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index f2b6d960135c..588859805c66 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -379,7 +379,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): new_attrs['layout'] = 'NCHW4c' new_attrs['out_layout'] = 'NCHW4c' new_attrs['kernel_layout'] = 'OIHW4o4i' - return sym.contrib.conv2d_NCHWc_int8_prepacked(*copy_inputs, **new_attrs) + return sym.conv2d(*copy_inputs, **new_attrs) # pre-compute weight transformation in winograd tile_size = _infer_tile_size(tinfos[0], tinfos[1]) From 71e4754b6808adca9fe8b5a8137ff23b6aa63d9a Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 25 Sep 2018 17:17:44 +0800 Subject: [PATCH 13/16] Add assert on target --- topi/python/topi/cuda/conv2d_winograd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 588859805c66..573e1375f0c7 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -376,6 +376,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): return None if cfg.template_key == 'int8': + assert tvm.target.current_target() == 'cuda' new_attrs['layout'] = 'NCHW4c' new_attrs['out_layout'] = 'NCHW4c' new_attrs['kernel_layout'] = 'OIHW4o4i' From 7bc4c09b1b7255ce1c571d94dc1dd18cbd475d91 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 26 Sep 2018 10:31:21 +0800 Subject: [PATCH 14/16] Fix assert on target --- topi/python/topi/cuda/conv2d_winograd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 573e1375f0c7..6a0a126b9e4f 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -376,7 +376,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): return None if cfg.template_key == 'int8': - assert tvm.target.current_target() == 'cuda' + assert 'cuda' in tvm.target.current_target().keys new_attrs['layout'] = 'NCHW4c' new_attrs['out_layout'] = 'NCHW4c' new_attrs['kernel_layout'] = 'OIHW4o4i' From 
d8492ceb25ac18cce060dd166f1068590c76ef87 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 26 Sep 2018 18:35:29 +0800 Subject: [PATCH 15/16] Bind block index to f, y, x --- topi/python/topi/cuda/conv2d_int8.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 4e1b2672bed2..a7e673f25f12 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -221,12 +221,8 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): # this is the scope to attach global config inside this kernel kernel_scope, n = s[output].split(n, nparts=1) - max_block_z = 128 - if batch > max_block_z: - _, n = s[output].split(n, factor=max_block_z) - s[output].bind(n, tvm.thread_axis("blockIdx.z")) - - s[output].bind(bf, tvm.thread_axis("blockIdx.y")) + s[output].bind(bf, tvm.thread_axis("blockIdx.z")) + s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) From 4521c6a6e100bc2ccdf900fa58cd8c3a838d4ce5 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 27 Sep 2018 10:16:56 +0800 Subject: [PATCH 16/16] Fuse by and bx --- topi/python/topi/cuda/conv2d_int8.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index a7e673f25f12..053c9bc6bd31 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -221,16 +221,20 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output, pre_computed): # this is the scope to attach global config inside this kernel kernel_scope, n = s[output].split(n, nparts=1) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) + max_block_z = 128 + if batch > max_block_z: + _, n = s[output].split(n, factor=max_block_z) + s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + fused_byx = s[output].fuse(by, bx) + s[output].bind(n, tvm.thread_axis("blockIdx.z")) + s[output].bind(bf, tvm.thread_axis("blockIdx.y")) + s[output].bind(fused_byx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[conv].compute_at(s[output], tx)
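A minimal end-to-end sketch of driving the int8 NCHWc template introduced by this series, modeled on the unit test added in patch 10 and the device check added in patch 11: a FallbackConfigEntity with template_key = 'int8' makes the conv2d dispatcher pick the int8 template even without tuning records, and the dp4a path needs an sm_61+ GPU (checked via nvcc.have_int8). The workload shape, the helper name run_int8_conv2d and the choice of out_dtype='int32' are illustrative assumptions, not anything fixed by the patches.

# Usage sketch (assumed workload and helper names) for the int8 NCHWc conv2d template.
import numpy as np
import tvm
from tvm import autotvm
from tvm.autotvm.task.space import FallbackConfigEntity
from tvm.contrib import nvcc
import topi
from topi.util import get_const_tuple


class NCHWcInt8Fallback(autotvm.FallbackContext):
    """Return a fallback config with template_key='int8' so the int8 template is selected."""
    def _query_inside(self, target, workload):
        key = (target, workload)
        if key not in self.memory:
            cfg = FallbackConfigEntity()
            cfg.template_key = 'int8'
            self.memory[key] = cfg
        return self.memory[key]


def run_int8_conv2d(batch=1, in_channel=64, in_size=56, num_filter=64,
                    kernel=3, stride=1, padding=1):
    ctx = tvm.context('cuda', 0)
    if not ctx.exist or not nvcc.have_int8(ctx.compute_version):
        print("Skipping: CUDA device with dp4a support (sm_61+) required")
        return None

    # int8 inputs in plain NCHW / OIHW; channel counts must be multiples of 4
    # so the template can repack them into NCHW4c / OIHW4o4i.
    A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A', dtype='int8')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8')

    with NCHWcInt8Fallback(), tvm.target.create('cuda'):
        # Dispatches to conv2d_NCHWc_int8 (packing + dp4a-tensorized schedule).
        C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding),
                           layout='NCHW', out_dtype='int32')
        s = topi.generic.schedule_conv2d_nchw([C])

    func = tvm.build(s, [A, W, C], 'cuda')
    a = tvm.nd.array(np.random.randint(-128, 128, size=get_const_tuple(A.shape)).astype('int8'), ctx)
    w = tvm.nd.array(np.random.randint(-128, 128, size=get_const_tuple(W.shape)).astype('int8'), ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func(a, w, c)
    return c.asnumpy()  # 5-D NCHW4c output


if __name__ == '__main__':
    run_int8_conv2d()

Once the 'int8' template has been tuned with AutoTVM, the same topi.nn.conv2d call would typically be made under autotvm.apply_history_best(log_file) instead of the fallback context, so the tuned split/unroll knobs replace the fallback configuration.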