From 271dac31d975c662503bc6b0564617eb5a7d1ee1 Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 25 Sep 2018 12:57:55 -0700 Subject: [PATCH 01/13] AutoTVM for x86 conv2d --- topi/python/topi/x86/conv2d.py | 357 +++++++++++++++------- topi/python/topi/x86/conv2d_avx_1x1.py | 176 +++++++---- topi/python/topi/x86/conv2d_avx_common.py | 186 +++++++---- tutorials/autotvm/tune_nnvm_x86.py | 192 ++++++++++++ 4 files changed, 677 insertions(+), 234 deletions(-) create mode 100644 tutorials/autotvm/tune_nnvm_x86.py diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 6fe59a909510..f20494d5c0ff 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -1,11 +1,13 @@ -# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D schedule on x86""" import tvm +from tvm import autotvm +from tvm.autotvm.task.dispatcher import ApplyGraphBest from .. import generic, tag from .. import nn -from ..nn.util import infer_pad, infer_stride +from ..util import get_const_tuple from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \ - _get_workload, _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \ + _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \ _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload from . 
import conv2d_avx_1x1, conv2d_avx_common @@ -194,103 +196,73 @@ def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout): def _get_alter_layout_schedule_x86(wkl): return _get_schedule_conv(wkl) -@conv2d.register("cpu") -def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): - _AVX_SCH_TO_DECL_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._declaration_conv, - AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv - } - out_dtype = data.dtype if out_dtype is None else out_dtype - target = tvm.target.current_target(allow_none=False) - wkl = _get_workload(data, kernel, stride, padding, out_dtype) +def _create_schedule_template(cfg, data, kernel, strides, padding, layout): + """Create schedule configuration from input arguments""" + dshape = get_const_tuple(data.shape) + kshape = get_const_tuple(kernel.shape) if layout == 'NCHW': - sch = _get_schedule(wkl) - return _AVX_SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout, out_dtype) - elif layout == 'HWCN': - return nn.conv2d_hwcn(data, kernel, stride, padding, out_dtype) - elif layout == 'NHWC': - return nn.conv2d_nhwc(data, kernel, stride, padding, out_dtype) + n, ic, h, w = dshape + oc, _, kh, kw = kshape else: - raise ValueError("not support this layout {} yet".format(layout)) - - -@conv2d_alter_layout.register("cpu") -def _alter_conv2d_layout(attrs, inputs, tinfos): - import nnvm.symbol as sym - copy_inputs = [s for s in inputs] - new_attrs = {k : attrs[k] for k in attrs.keys()} - # only optimize for NCHW, groups=1 conv - if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1: - return None - - data = tinfos[0] - kernel = tinfos[1] - - import ast - padding = ast.literal_eval(attrs['padding']) - stride = ast.literal_eval(attrs['strides']) - - wkl = _get_workload(data, kernel, stride, padding, data.dtype) - sch = _get_alter_layout_schedule(wkl) - is_kernel_1x1 = isinstance(sch, AVXConv1x1Fwd) - ic_bn, oc_bn = sch.ic_bn, sch.oc_bn - - new_attrs['layout'] = 'NCHW%dc' % ic_bn - 
new_attrs['out_layout'] = 'NCHW%dc' % oc_bn - + raise ValueError("Not support this layout {} with " + "schedule template.".format(layout)) + is_kernel_1x1 = kh == 1 and kw == 1 + ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding) + sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) + oh = (h - kh + 2 * ph) // sh + 1 + ow = (w - kw + 2 * pw) // sw + 1 + + # Create schedule config + cfg.define_split("tile_ic", ic, num_outputs=2) + cfg.define_split("tile_oc", oc, num_outputs=2) + cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) if is_kernel_1x1: - # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w) - new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn) + cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) else: - # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) - new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + cfg.define_knob("unroll_kw", [True, False]) - return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) +def conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype): + """convert argument to workload""" + if len(kernel.shape) == 4: + raw_kernel = kernel + else: # the input kernel is transformed by alter_op_layout + shape = get_const_tuple(kernel.shape) + raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]), + dtype=kernel.dtype) + return ('conv2d', ) + autotvm.task.args_to_workload( + [data, raw_kernel, strides, padding, layout, out_dtype]) -@conv2d_NCHWc.register("cpu") -def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, - padding, layout, out_layout, out_dtype): - _AVX_SCH_TO_DECL_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc, - AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc - } +@conv2d.register("cpu") +@autotvm.task.dispatcher +def conv2d_x86(data, kernel, strides, padding, layout, out_dtype): + """x86 conv2d declaration.""" + return conv_arg_to_workload(data, 
kernel, strides, padding, layout, out_dtype) - # Use int8 schedules if the input data is of int8 dtype - if data.dtype == 'uint8': - _AVX_SCH_TO_DECL_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc_int8, - AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc_int8 - } - n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] - ic = ic_chunk * ic_block - kh, kw = kernel_size - if data.dtype == 'uint8': - wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype), - tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype), - stride, padding, out_dtype) - sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) +@conv2d_x86.register(["direct"]) +def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): + out_dtype = data.dtype if out_dtype is None else out_dtype + if layout == 'NCHW': + _create_schedule_template(cfg, data, kernel, strides, padding, layout) + args = [cfg, data, kernel, strides, padding, layout, out_dtype] + _, _, kh, kw = get_const_tuple(kernel.shape) + is_kernel_1x1 = kh == 1 and kw == 1 + return conv2d_avx_1x1._declaration_conv(*args) if is_kernel_1x1 else \ + conv2d_avx_common._declaration_conv(*args) + elif layout == 'HWCN': + return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype) + elif layout == 'NHWC': + return nn.conv2d_nhwc(data, kernel, strides, padding, out_dtype) else: - wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), - tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype), - stride, padding, out_dtype) - sch = _get_schedule_NCHWc(wkl, layout, out_layout) - return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) + raise ValueError("not support this layout {} yet".format(layout)) -@generic.schedule_conv2d_nchw.register(["cpu"]) -def schedule_conv2d(outs): +@autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct']) +def schedule_conv2d(cfg, outs): """Create schedule for tensors""" - 
_AVX_SCH_TO_SCH_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._schedule_conv, - AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv - } s = tvm.create_schedule([x.op for x in outs]) - target = tvm.target.current_target(allow_none=False) scheduled_ops = [] def traverse(op): @@ -316,16 +288,15 @@ def traverse(op): if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] - padding = infer_pad(data, data_pad) - if data_pad is None: - stride = infer_stride(data, kernel, output) - else: - stride = infer_stride(data_pad, kernel, output) - wkl = _get_workload(data, kernel, stride, padding, output.dtype) - sch = _get_schedule(wkl) - _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, - kernel, kernel_vec, conv_out, output, outs[0]) + _, _, kh, kw = get_const_tuple(kernel.shape) + is_kernel_1x1 = kh == 1 and kw == 1 + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, + output, outs[0]] + if is_kernel_1x1: + conv2d_avx_1x1._schedule_conv(*args) + else: + conv2d_avx_common._schedule_conv(*args) scheduled_ops.append(op) @@ -333,7 +304,7 @@ def traverse(op): return s -@generic.schedule_conv2d_nhwc.register(["cpu"]) +@generic.schedule_conv2d_nhwc.register("cpu") def schedule_conv2d_nhwc(outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) @@ -388,12 +359,166 @@ def traverse(op): return s -@generic.schedule_conv2d_NCHWc.register(["cpu"]) -def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, +def conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, + padding, layout, out_layout, out_dtype): + """convert argument to workload""" + dshape = get_const_tuple(data.shape) + kshape = get_const_tuple(kernel.shape) + kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \ + (kernel_size, kernel_size) + is_kernel_1x1 = kh == 1 and kw == 1 + if len(dshape) > 4: + raw_data = tvm.placeholder((dshape[0], dshape[1] * dshape[4], dshape[2], + 
dshape[3]), dtype=kernel.dtype) + else: + raw_data = data + if len(kshape) > 4: + if is_kernel_1x1: + raw_kernel = tvm.placeholder((kshape[0] * kshape[3], kshape[1] * kshape[2], + kshape[4], kshape[5]), dtype=kernel.dtype) + else: + raw_kernel = tvm.placeholder((kshape[0] * kshape[5], kshape[1] * kshape[4], + kshape[2], kshape[3]), dtype=kernel.dtype) + else: + raw_kernel = kernel + return ('conv2d_NCHWc', ) + autotvm.task.args_to_workload( + [raw_data, raw_kernel, strides, padding, layout, out_layout, + out_dtype]) + + +def _get_fp32_len(): + fp32_vec_len = 8 + target = tvm.target.current_target() + if target is not None: + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + fp32_vec_len = 16 + return fp32_vec_len + + +def _query_dispatcher(workload, in_alter_op=False): + dispatch_ctx = autotvm.task.DispatchContext.current + if isinstance(dispatch_ctx, ApplyGraphBest): + if in_alter_op: + cfg = dispatch_ctx.query(None, None) + else: + cfg = dispatch_ctx.query_global_dict(workload) + else: + target = tvm.target.current_target() + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: + fp32_vec_len = _get_fp32_len() + _, _, kh, kw, _ = workload[2] + is_kernel_1x1 = kh == 1 and kw == 1 + if is_kernel_1x1: + cfg = conv2d_avx_1x1._fallback_schedule(workload, fp32_vec_len) + else: + cfg = conv2d_avx_common._fallback_schedule(workload, fp32_vec_len) + return cfg + + +@conv2d_alter_layout.register("cpu") +def _alter_conv2d_layout(attrs, inputs, tinfo): + import nnvm.symbol as sym + copy_inputs = [s for s in inputs] + new_attrs = {k : attrs[k] for k in attrs.keys()} + data, kernel = tinfo[0], tinfo[1] + # only optimize for NCHW, groups=1 conv + if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1: + return None + + import ast + kernel_size = ast.literal_eval(attrs["kernel_size"]) + padding = ast.literal_eval(attrs["padding"]) + strides = ast.literal_eval(attrs["strides"]) + layout = attrs['layout'] + out_layout = layout if 
attrs["out_layout"] == "__undef__" else attrs["out_layout"] + + dtype = data.dtype + out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"] + kh, kw = kernel_size + is_kernel_1x1 = kh == 1 and kw == 1 + workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, + padding, layout, out_layout, out_dtype) + cfg = _query_dispatcher(workload, True) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + new_attrs['layout'] = 'NCHW%dc' % ic_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store global schedule dictionary for ApplyGraphBest dispatcher + dispatch_ctx = autotvm.task.DispatchContext.current + if isinstance(dispatch_ctx, ApplyGraphBest): + workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, + padding, new_attrs['layout'], + new_attrs['out_layout'], out_dtype) + global_dict_key = workload + dispatch_ctx.update_global_dict(global_dict_key, cfg) + + if is_kernel_1x1: + # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w) + new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn) + else: + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + + return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + + +@conv2d_NCHWc.register("cpu") +def conv2d_NCHWc_cpu(data, kernel, num_filter, kernel_size, strides, + padding, layout, out_layout, out_dtype): + """x86 conv2d_NCHWc declaration.""" + dispatch_ctx = autotvm.task.DispatchContext.current + if not isinstance(dispatch_ctx, ApplyGraphBest): + layout = out_layout = "NCHW" + workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, + padding, layout, out_layout, out_dtype) + cfg = _query_dispatcher(workload) + return _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides, + padding, layout, out_layout, out_dtype) + + +def _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides, + padding, layout, out_layout, out_dtype): + n, ic_chunk, h, w, 
ic_block = [x.value for x in data.shape] + ic = ic_chunk * ic_block + kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \ + (kernel_size, kernel_size) + is_kernel_1x1 = kh == 1 and kw == 1 + ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding) + sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) + + if data.dtype == 'uint8': + wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), + strides, padding, out_dtype) + sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) + return conv2d_avx_1x1._declaration_conv_NCHWc_int8(wkl, sch, data, kernel) \ + if is_kernel_1x1 \ + else conv2d_avx_common._declaration_conv_NCHWc_int8(wkl, sch, data, kernel) + + args = [cfg, data, kernel, (kh, kw), (sh, sw), (ph, pw), layout, out_layout, out_dtype] + return conv2d_avx_1x1._declaration_conv_NCHWc(*args) if is_kernel_1x1 else \ + conv2d_avx_common._declaration_conv_NCHWc(*args) + + +@generic.schedule_conv2d_NCHWc.register("cpu") +def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, padding, layout, out_layout, outs): + """x86 conv2d_NCHWc schedule""" + return _schedule_conv2d_NCHWc(None, num_filter, kernel_size, strides, padding, + layout, out_layout, outs) + + +def _schedule_conv2d_NCHWc(cfg, num_filter, kernel_size, strides, padding, + layout, out_layout, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] + dispatch_ctx = autotvm.task.DispatchContext.current + if not isinstance(dispatch_ctx, ApplyGraphBest): + layout = out_layout = "NCHW" def traverse(op): """Traverse operators from computation graph""" @@ -416,18 +541,9 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - _AVX_SCH_TO_SCH_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, - AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc - } - - # Use int8 schedules if 
the input data is of int8 dtype - if data.dtype == 'uint8': - _AVX_SCH_TO_SCH_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8, - AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8 - } - + kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \ + (kernel_size, kernel_size) + is_kernel_1x1 = kh == 1 and kw == 1 n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype) @@ -435,16 +551,27 @@ def traverse(op): kh, kw = kernel_size original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=kernel.dtype) - if data.dtype == 'uint8': wkl = _get_workload_int8(original_data, original_kernel, - stride, padding, conv_out.dtype) + strides, padding, conv_out.dtype) sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) + args = [s, wkl, sch, data_vec, kernel, conv_out, outs[0]] + if is_kernel_1x1: + conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args) + else: + conv2d_avx_common._schedule_conv_NCHWc_int8(*args) else: - wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) - sch = _get_schedule_NCHWc(wkl, layout, out_layout) - _AVX_SCH_TO_SCH_FUNC[type(sch)](s, wkl, sch, data_vec, - kernel, conv_out, outs[0]) + current_cfg = cfg + if current_cfg is None: + workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, + padding, layout, out_layout, + conv_out.dtype) + current_cfg = _query_dispatcher(workload) + args = [s, current_cfg, data_vec, conv_out, outs[0]] + if is_kernel_1x1: + conv2d_avx_1x1._schedule_conv_NCHWc(*args) + else: + conv2d_avx_common._schedule_conv_NCHWc(*args) scheduled_ops.append(op) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index bace7451d665..10c57dc3fd46 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -3,11 +3,12 @@ from __future__ import absolute_import as _abs from 
collections import namedtuple import tvm +from tvm.autotvm.task import ConfigEntity + import topi from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload -from ..nn.util import infer_pad, infer_stride +from ..nn.util import infer_pad from ..nn.pad import pad from .tensor_intrin import dot_16x1x16_int8_int8_int32 from .check_targets import check_skylake @@ -42,13 +43,48 @@ def _get_default_schedule(wkl, simd_width): raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) -def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): +def _fallback_schedule(wkl, simd_width): + batch_size, in_channel, height, width, _ = wkl[1] + out_channel, _, hkernel, wkernel, _ = wkl[2] + HPAD, WPAD = wkl[4] + HSTR, WSTR = wkl[3] + out_height = (height + 2 * HPAD - hkernel) // HSTR + 1 + out_width = (width + 2 * WPAD - wkernel) // WSTR + 1 + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if out_channel % bn == 0: + oc_bn = bn + break + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if in_channel % bn == 0: + ic_bn = bn + break + + for ow_factor in range(out_width, 0, -1): + if out_width % ow_factor == 0: + for oh_factor in range(out_height, 0, -1): + if out_height % oh_factor == 0 and ow_factor * oh_factor < 32: + cfg_dict = {"i": -1, + "c": None, + "e": [["tile_ic", "sp", [in_channel // ic_bn, ic_bn]], + ["tile_oc", "sp", [out_channel // oc_bn, oc_bn]], + ["tile_oh", "ot", oh_factor], + ["tile_ow", "sp", [out_width // ow_factor, + ow_factor]],], + "t": ""} + return ConfigEntity.from_json_dict(cfg_dict) + + raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) + + +def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): assert layout == 'NCHW', "only support NCHW convolution for AVX" - wkl = _get_workload(data, kernel, stride, padding, out_dtype) - sch = _get_schedule(wkl) - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride + HPAD, WPAD = 
padding + HSTR, WSTR = strides batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) @@ -64,40 +100,44 @@ def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data - shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) - data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w]) - shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn, sch.oc_bn, 1, 1) + # fetch schedule + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + shape = (batch_size, in_channel // ic_bn, pad_height, pad_width, ic_bn) + data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, C * ic_bn + c, h, w]) + + shape = (num_filter // oc_bn, in_channel // ic_bn, ic_bn, oc_bn, 1, 1) kernel_vec = tvm.compute(shape, lambda CO, CI, ci, co, h, w: - kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w], + kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], name='kernel_vec') - oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) + oshape = (batch_size, num_filter // oc_bn, out_height, out_width, oc_bn) ic = tvm.reduce_axis((0, in_channel), name='ic') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn] * - kernel_vec[oc_chunk, ic//sch.ic_bn, ic%sch.ic_bn, oc_block, 0, 0], + tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR, ow*WSTR, ic%ic_bn] * + kernel_vec[oc_chunk, ic//ic_bn, ic%ic_bn, oc_block, 0, 0], axis=[ic]), name='conv') oshape = (batch_size, num_filter, out_height, out_width) unpack = tvm.compute(oshape, lambda n, oc, oh, ow: - conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn], - tag='conv2d_nchw') + conv[n, oc // oc_bn, oh, ow, oc % oc_bn], + tag='conv2d_nchw', + attrs={'workload': + 
topi.x86.conv2d.conv_arg_to_workload(data, kernel, + strides, padding, + layout, out_dtype)}) return unpack -def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last): +def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): + # fetch schedule + ic_bn, oc_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], + cfg["tile_oh"].val, cfg["tile_ow"].size[-1]) + # no stride and padding info here padding = infer_pad(data, data_pad) - if data_pad is None: - stride = infer_stride(data, kernel, output) - else: - stride = infer_stride(data_pad, kernel, output) - - wkl = _get_workload(data, kernel, stride, padding, output.dtype) - sch = _get_schedule(wkl) - - HPAD, WPAD = wkl.hpad, wkl.wpad + HPAD, WPAD = padding DOPAD = (HPAD != 0 or WPAD != 0) A, W = data, kernel_vec @@ -112,7 +152,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou # schedule kernel pack oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) - if sch.oc_bn > 1: + if oc_bn > 1: s[W].vectorize(oc_block) parallel_axis = s[W].fuse(oc_chunk, oh) s[W].parallel(parallel_axis) @@ -121,17 +161,17 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou CC = s.cache_write(C, 'global') batch, oc_chunk, oh, ow, oc_block = s[C].op.axis - oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor) + oh_outer, oh_inner = s[C].split(oh, factor=oh_factor) s[C].vectorize(oc_block) s[CC].compute_at(s[C], oh_outer) _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic, = s[CC].op.reduce_axis - ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) - ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) + oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[CC].split(ow, 
factor=ow_factor) s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block) s[CC].vectorize(oc_block) @@ -143,9 +183,9 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou s[O0].compute_inline() batch, oc, oh, ow = s[O].op.axis - oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn) - oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor) - ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) parallel_axis = s[O].fuse(oc_chunk, oh_outer) @@ -157,14 +197,18 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou return s -def _declaration_conv_NCHWc(wkl, sch, data, kernel): - out_dtype = wkl.out_dtype - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride +def _declaration_conv_NCHWc(cfg, data, kernel, kernel_size, strides, padding, layout, + out_layout, out_dtype): + HPAD, WPAD = padding + HSTR, WSTR = strides - batch_size = data.shape[0] - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + n, ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape) + ic = ic_chunk * ic_block + kh, kw = kernel_size + oc_chunk, _, _, oc_block, _, _ = get_const_tuple(kernel.shape) + oc = oc_chunk * oc_block + oh = (ih + 2 * HPAD - kh) // HSTR + 1 + ow = (iw + 2 * WPAD - kw) // WSTR + 1 DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: @@ -172,18 +216,38 @@ def _declaration_conv_NCHWc(wkl, sch, data, kernel): else: data_pad = data - oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) - ic = tvm.reduce_axis((0, wkl.in_filter), name='ic') + # fetch schedule + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + if ic_bn != 
ic_block: + raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d." + % (ic_bn, ic_block)) + if oc_bn != oc_block: + raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d." + % (oc_bn, oc_block)) + + # convolution + workload = topi.x86.conv2d.conv_NCHWc_arg_to_workload(data, kernel, + kernel_size, + strides, padding, + layout, out_layout, + out_dtype), + attrs = {'workload': workload} + oshape = (n, oc//oc_bn, oh, ow, oc_bn) + ic = tvm.reduce_axis((0, ic), name='ic') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn] - .astype(out_dtype) * - kernel[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn, oc_block, 0, 0], - axis=[ic]), name='conv2d_NCHWc', tag='conv2d_NCHWc') - + tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR, ow*WSTR, + ic%ic_bn].astype(out_dtype) * + kernel[oc_chunk, ic // ic_bn, ic % ic_bn, oc_block, 0, 0], + axis=[ic]), + name='conv2d_NCHWc', tag='conv2d_NCHWc', attrs=attrs) return conv -def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): +def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): + # fetch schedule + ic_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oh"].val, + cfg["tile_ow"].size[-1]) + # schedule data A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): @@ -195,8 +259,8 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): CC = s.cache_write(C, 'global') batch, oc_chunk, oh, ow, oc_block = s[C].op.axis - oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor) - ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor) + oh_outer, oh_inner = s[C].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[C].split(ow, factor=ow_factor) s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) s[C].vectorize(oc_block) @@ -208,10 +272,10 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): _, oc_chunk, oh, ow, oc_block = 
s[CC].op.axis ic, = s[CC].op.reduce_axis - ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) - ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) + oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor) s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block) s[CC].fuse(oc_chunk, oh_outer) @@ -222,8 +286,8 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): if C != O: batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor) - ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor) + oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) parallel_axis = s[O].fuse(oc_chunk, oh_outer) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 0d7aba23d236..56a5b6155ef5 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -3,10 +3,12 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm +from tvm.autotvm.task import ConfigEntity + +import topi from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload -from ..nn.util import infer_pad, infer_stride +from ..nn.util import infer_pad from ..nn.pad import pad from .tensor_intrin import dot_16x1x16_int8_int8_int32 from .check_targets import check_skylake @@ -17,7 +19,6 @@ def _get_default_schedule(wkl, simd_width): HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 oc_bn = 1 @@ -41,14 +42,47 @@ def 
_get_default_schedule(wkl, simd_width): return AVXConvCommonFwd(ic_bn, oc_bn, reg_n, False) -def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): +def _fallback_schedule(wkl, simd_width): + batch_size, in_channel, height, width, _ = wkl[1] + out_channel, _, hkernel, wkernel, _ = wkl[2] + HPAD, WPAD = wkl[4] + HSTR, WSTR = wkl[3] + out_width = (width + 2 * WPAD - wkernel) // WSTR + 1 + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if out_channel % bn == 0: + oc_bn = bn + break + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if in_channel % bn == 0: + ic_bn = bn + break + + reg_n = 1 + for n in range(31, 0, -1): + if out_width % n == 0: + reg_n = n + break + + cfg_dict = {"i": -1, + "c": None, + "e": [["tile_ic", "sp", [in_channel // ic_bn, ic_bn]], + ["tile_oc", "sp", [out_channel // oc_bn, oc_bn]], + ["tile_ow", "sp", [out_width // reg_n, reg_n]], + ["unroll_kw", "ot", False]], + "t": ""} + return ConfigEntity.from_json_dict(cfg_dict) + + +def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): out_dtype = data.dtype if out_dtype is None else out_dtype assert layout == 'NCHW', "only support NCHW convolution for AVX" - wkl = _get_workload(data, kernel, stride, padding, out_dtype) - sch = _get_schedule(wkl) - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride + HPAD, WPAD = padding + HSTR, WSTR = strides batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) @@ -66,20 +100,23 @@ def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): else: data_pad = data - shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn, pad_width) + # fetch schedule + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width) data_vec = tvm.compute(shape, - lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w], + 
lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w], name='data_vec') # pack kernel - shape = (num_filter//sch.oc_bn, in_channel//sch.ic_bn, - kernel_height, kernel_width, sch.ic_bn, sch.oc_bn) + shape = (num_filter//oc_bn, in_channel//ic_bn, + kernel_height, kernel_width, ic_bn, oc_bn) kernel_vec = tvm.compute(shape, lambda CO, CI, h, w, ci, co: - kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w], + kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], name='kernel_vec') # convolution - oshape = (batch_size, num_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) + oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, num_filter, out_height, out_width) ic = tvm.reduce_axis((0, in_channel), name='ic') @@ -87,32 +124,32 @@ def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR+kh, ic%sch.ic_bn, ow*WSTR+kw] - .astype(out_dtype) * - kernel_vec[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block] - .astype(out_dtype), - axis=[ic, kh, kw]), - name='conv') + tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh, ic%ic_bn, + ow*WSTR+kw].astype(out_dtype) * + kernel_vec[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, + oc_block].astype(out_dtype), + axis=[ic, kh, kw]), name='conv') unpack = tvm.compute(unpack_shape, - lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn] + lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn] .astype(out_dtype), name='output_unpack', - tag='conv2d_nchw') + tag='conv2d_nchw', + attrs={'workload': + topi.x86.conv2d.conv_arg_to_workload(data, kernel, + strides, padding, + layout, out_dtype)}) return unpack -def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last): +def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): + # fetch schedule + ic_bn, oc_bn, 
reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) + # no stride and padding info here padding = infer_pad(data, data_pad) - if data_pad is None: - stride = infer_stride(data, kernel, output) - else: - stride = infer_stride(data_pad, kernel, output) - wkl = _get_workload(data, kernel, stride, padding, output.dtype) - sch = _get_schedule(wkl) - - HPAD, WPAD = wkl.hpad, wkl.wpad + HPAD, WPAD = padding DOPAD = (HPAD != 0 or WPAD != 0) A, W = data, kernel_vec @@ -128,7 +165,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou # schedule kernel pack oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) - if sch.oc_bn > 1: + if oc_bn > 1: s[W].vectorize(oc_block) parallel_axis = s[W].fuse(oc_chunk, oh) s[W].parallel(parallel_axis) @@ -138,7 +175,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou CC = s.cache_write(C, 'global') _, oc_chunk, oh, ow, oc_block = s[C].op.axis - ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n) + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) s[C].fuse(oc_chunk, oh) s[C].vectorize(oc_block) @@ -147,10 +184,10 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic, kh, kw = s[CC].op.reduce_axis - ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) - ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn) + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - if sch.unroll_kw: + if unroll_kw: s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) s[CC].unroll(kw) else: @@ -164,8 +201,8 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou s[O0].compute_inline() batch, oc, oh, ow = 
s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n) - oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn) + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) parallel_axis = s[O].fuse(oc_chunk, oh) s[C].compute_at(s[O], parallel_axis) @@ -176,39 +213,62 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou return s -def _declaration_conv_NCHWc(wkl, sch, data, kernel): - out_dtype = wkl.out_dtype - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride +def _declaration_conv_NCHWc(cfg, data, kernel, kernel_size, strides, padding, layout, + out_layout, out_dtype): + HPAD, WPAD = padding + HSTR, WSTR = strides - batch_size = data.shape[0] - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + n, ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape) + ic = ic_chunk * ic_block + kh, kw = kernel_size + oc_chunk, _, _, _, _, oc_block = get_const_tuple(kernel.shape) + oc = oc_chunk * oc_block + oh = (ih + 2 * HPAD - kh) // HSTR + 1 + ow = (iw + 2 * WPAD - kw) // WSTR + 1 - # pack data + # DOPAD DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: data_pad = data - # convolution - oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) - - ic = tvm.reduce_axis((0, wkl.in_filter), name='ic') - kh = tvm.reduce_axis((0, wkl.hkernel), name='kh') - kw = tvm.reduce_axis((0, wkl.wkernel), name='kw') + # fetch schedule + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + if ic_bn != ic_block: + raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d." + % (ic_bn, ic_block)) + if oc_bn != oc_block: + raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d." 
+ % (oc_bn, oc_block)) + # convolution + oshape = (n, oc//oc_bn, oh, ow, oc_bn) + + ic = tvm.reduce_axis((0, ic), name='ic') + kh = tvm.reduce_axis((0, kernel_size[0]), name='kh') + kw = tvm.reduce_axis((0, kernel_size[1]), name='kw') + + workload = topi.x86.conv2d.conv_NCHWc_arg_to_workload(data, kernel, + kernel_size, + strides, padding, + layout, out_layout, + out_dtype), + attrs = {'workload': workload} conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic//sch.ic_bn, oh*HSTR+kh, ow*WSTR+kw, ic%sch.ic_bn] - .astype(out_dtype) * - kernel[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block], - axis=[ic, kh, kw]), name='conv2d_NCHWc', tag="conv2d_NCHWc") - + tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw, + ic%ic_bn].astype(out_dtype) * + kernel[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, oc_block], + axis=[ic, kh, kw]), + name='conv2d_NCHWc', tag="conv2d_NCHWc", attrs=attrs) return conv -def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): +def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): + # fetch schedule + ic_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val) + # schedule data A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): @@ -221,7 +281,7 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): CC = s.cache_write(C, 'global') _, oc_chunk, oh, ow, oc_block = s[C].op.axis - ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n) + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) parallel_axis = s[C].fuse(oc_chunk, oh) s[C].vectorize(oc_block) @@ -232,10 +292,10 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic, kh, kw = s[CC].op.reduce_axis - ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) - ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn) + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) 
+ ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - if sch.unroll_kw: + if unroll_kw: s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) s[CC].unroll(kw) else: @@ -246,7 +306,7 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): if C != O: batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n) + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) parallel_axis = s[O].fuse(oc_chunk, oh) s[C].compute_at(s[O], parallel_axis) diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py new file mode 100644 index 000000000000..406b2c0edde0 --- /dev/null +++ b/tutorials/autotvm/tune_nnvm_x86.py @@ -0,0 +1,192 @@ +""" +Auto-tuning a convolutional network for x86 CPU +==================================================== +**Author**: `Yao Wang `_ + +This is a tutorial about how to tune convolution neural network +for x86 cpu. +""" + +import numpy as np + +import nnvm.testing +import nnvm.compiler +import tvm +from tvm import autotvm +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from topi.x86.conv2d import conv_NCHWc_arg_to_workload +import tvm.contrib.graph_runtime as runtime + +################################################################# +# Define network +# -------------- +# First we need to define the network in nnvm symbol API. +# We can load some pre-defined network from :code:`nnvm.testing`. +# We can also load models from MXNet, ONNX and TensorFlow (see NNVM +# tutorials :ref:`tutorial-nnvm` for more details). +# +# In this tutorial, we choose resnet-18 as tuning example. 
+ +def get_network(name, batch_size): + """Get the symbol definition and random weight of a network""" + input_shape = (batch_size, 3, 224, 224) + output_shape = (batch_size, 1000) + + if "resnet" in name: + n_layer = int(name.split('-')[1]) + net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size) + elif "vgg" in name: + n_layer = int(name.split('-')[1]) + net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size) + elif name == 'mobilenet': + net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size) + elif name == 'squeezenet_v1.1': + net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1') + elif name == 'inception_v3': + input_shape = (1, 3, 299, 299) + net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size) + elif name == 'custom': + # an example for custom network + from nnvm.testing import utils + net = nnvm.sym.Variable('data') + net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1)) + net = nnvm.sym.flatten(net) + net = nnvm.sym.dense(net, units=1000) + net, params = utils.create_workload(net, batch_size, (3, 224, 224)) + elif name == 'mxnet': + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + block = get_model('resnet18_v1', pretrained=True) + net, params = nnvm.frontend.from_mxnet(block) + net = nnvm.sym.softmax(net) + else: + raise ValueError("Unsupported network: " + name) + + return net, params, input_shape, output_shape + +# Replace "llvm" with the correct target of your cpu. +# For example, for AWS EC2 c5 instance with Intel Xeon +# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512". +# For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be +# "llvm -mcpu=core-avx2". 
+target = "llvm" + +batch_size = 1 +dtype = "float32" +model_name = "resnet-18" +log_file = "%s.log" % model_name + +# Set number of threads used for tuning based on the number of +# physical cpu cores on your machine. +num_threads = 1 + + +################################################################# +# Configure tensor tuning settings and create tasks +# ------------------------------------------------- +# To get better kernel execution performance on x86 cpu, +# we need to change data layout of convolution kernel from +# "NCHW" to "NCHWc". To deal with this situation, we define +# conv2d_NCHWc operator in topi. We will tune this operator +# instead of plain conv2d. +# +# We will use local mode for tuning configuration. RPC tracker +# mode can be setup similarly to the approach in autotvm +# arm_cpu tutorial. + +tuning_option = { + 'log_filename': log_file, + 'tuner': 'gridsearch', + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(n_parallel=1), + runner=autotvm.LocalRunner(number=10, repeat=1, + min_repeat_ms=1000), + ), +} + +# You can skip the implementation of this function for this tutorial. 
+def tune_kernels(tasks, + measure_option, + tuner='gridsearch', + early_stopping=None, + log_filename='tuning.log'): + + for i, tsk in enumerate(tasks): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # converting conv2d tasks to conv2d_NCHWc tasks + data, kernel, strides, padding, layout, dtype = tsk.args + kernel_size = (kernel[1][2], kernel[1][3]) + data_plc = tvm.placeholder(data[1], name="data") + kernel_plc = tvm.placeholder(kernel[1], name="kernel") + args = [data_plc, kernel_plc, data[1][1], kernel_size, strides, + padding, layout, layout, dtype] + args = autotvm.task.nnvm_integration.serialize_args(args) + task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=args, target=target) + task.workload = conv_NCHWc_arg_to_workload(data_plc, kernel_plc, kernel_size, + strides, padding, layout, dtype) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(task, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(task, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(task) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(task) + else: + raise ValueError("Invalid tuner: " + tuner) + + # do tuning + n_trial=len(task.config_space) + tuner_obj.tune(n_trial=n_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial, prefix=prefix), + autotvm.callback.log_to_file(log_filename)]) + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
+ +def tune_and_evaluate(tuning_opt): + # extract workloads from nnvm graph + print("Extract tasks...") + net, params, data_shape, out_shape = get_network(model_name, batch_size) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': data_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d,)) + + # run tuning tasks + print("Tuning...") + tune_kernels(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.apply_history_best(log_file): + print("Compile...") + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build( + net, target=target, shape={'data': data_shape}, params=params, dtype=dtype) + + # upload parameters to device + ctx = tvm.cpu() + data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype)) + module = runtime.create(graph, lib, ctx) + module.set_input('data', data_tvm) + module.set_input(**params) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. 
+ +# tune_and_evaluate(tuning_option) From f2a923ae6b7d11b2f8963ba2795c9a9975cbfda9 Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 25 Sep 2018 13:38:58 -0700 Subject: [PATCH 02/13] Add ApplyGraphBest dispatch context --- python/tvm/autotvm/__init__.py | 3 +- python/tvm/autotvm/task/__init__.py | 2 +- python/tvm/autotvm/task/dispatcher.py | 80 +++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index 7170dbdd8565..08cfbb2a95da 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -27,5 +27,6 @@ from .tuner import callback from .task import template, get_config, create, ConfigSpace, ConfigEntity, \ register_topi_compute, register_topi_schedule, \ - DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best + DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best, \ + ApplyGraphBest as apply_graph_best from .env import GLOBAL_SCOPE diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py index 8efb0e61b518..04bcec92fd57 100644 --- a/python/tvm/autotvm/task/__init__.py +++ b/python/tvm/autotvm/task/__init__.py @@ -10,7 +10,7 @@ from .space import ConfigSpace, ConfigEntity from .code_hash import attach_code_hash, attach_code_hash_to_arg from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \ - FallbackContext, clear_fallback_cache + FallbackContext, clear_fallback_cache, ApplyGraphBest from .topi_integration import register_topi_compute, register_topi_schedule from .nnvm_integration import extract_from_graph, extract_from_multiple_graph diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 8e159cc412c9..164877e3b451 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -345,3 +345,83 @@ def clear_fallback_cache(target, workload): while not isinstance(context, 
FallbackContext): context = context._old_ctx context.clear_cache(target, workload) + +class ApplyGraphBest(DispatchContext): + """Load the graph level tuning optimal schedules. + + The input records should be in the ascending order of + node index for target operator. Usually this can be obtained + with graph tuner. + + This context maintains an internal counter to indicate the current + node index. + """ + def __init__(self, records): + """ + Parameters + ---------- + records : str or iterator of (MeasureInput, MeasureResult) + Collection of tuning records. + If is str, then it should be the filename of a records log file. + Each row of this file is an encoded record pair. + Otherwise, it is an iterator. + """ + from ..record import load_from_file + + super(ApplyGraphBest, self).__init__() + if isinstance(records, str): + records = load_from_file(records) + self._records = list(records) + self._counter = 0 + self._global_cfg_dict = {} + + def _query_inside(self, target, workload): + """ + Query the context to get config from records. + + Parameters + ---------- + target : Target + The current target + workload : Workload + The current workload. + + Returns + ------- + cfg : ConfigSpace + The specific configuration. + """ + cfg = self._records[self._counter][0].config + self._counter += 1 + return cfg + + def query_global_dict(self, key): + """ + Query the context to get config from global + config dictionary. + + Parameters + ---------- + key : str + Key to query the config. + + Returns + ------- + cfg : ConfigSpace + The specific configuration. + """ + return self._global_cfg_dict[key] + + def update_global_dict(self, key, val): + """ + Update the global config dictionary. + + Parameters + ---------- + key : str + Key of config. + + val : ConfigSpace + Value of config. 
+ """ + self._global_cfg_dict[key] = val From bc11542c3193742bf3149a47de2b175bc3ae15ee Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Sep 2018 20:57:06 +0000 Subject: [PATCH 03/13] Fix tutorial --- topi/python/topi/x86/conv2d.py | 49 ++++++++++++++++++++++++++++++ tutorials/autotvm/tune_nnvm_x86.py | 4 +-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index f20494d5c0ff..bf6d8f5cb442 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -3,6 +3,8 @@ import tvm from tvm import autotvm from tvm.autotvm.task.dispatcher import ApplyGraphBest +from tvm.autotvm.task.nnvm_integration import deserialize_args +from tvm.autotvm.task import register, get_config from .. import generic, tag from .. import nn from ..util import get_const_tuple @@ -359,6 +361,53 @@ def traverse(op): return s +# Define template function for autotvm task +# We define schedule template in this function instead of +# declaration function since actual input arguments need +# to be altered by the schedule selected. 
+@register("topi_x86_conv2d_NCHWc") +def _topi_nn_conv2d_NCHWc(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + data, kernel = args[:2] + kernel_size = args[3] + strides = args[4] + padding = args[5] + layout = args[6] + kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \ + (kernel_size, kernel_size) + is_kernel_1x1 = kh == 1 and kw == 1 + raw_data_shape = get_const_tuple(data.shape) + raw_kernel_shape = get_const_tuple(kernel.shape) + + # get config here + cfg = get_config() + _create_schedule_template(cfg, data, kernel, strides, padding, layout) + + # change shape with the value in config + ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1]) + new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, + raw_data_shape[2], raw_data_shape[3], ic_bn) + data_layout = "NCHW%dc" % ic_bn + out_layout = "NCHW%dc" % oc_bn + if is_kernel_1x1: + new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, + ic_bn, oc_bn, raw_kernel_shape[2], raw_kernel_shape[3]) + else: + new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, + raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) + args[0] = tvm.placeholder(new_data_shape, data.dtype) + args[1] = tvm.placeholder(new_kernel_shape, kernel.dtype) + args[6] = data_layout + args[7] = out_layout + + C = _declaration_conv_NCHWc(cfg, *args, **kwargs) + s = _schedule_conv2d_NCHWc(cfg, args[2], args[3], args[4], args[5], + args[6], args[7], [C]) + return s, [args[0], args[1], C] + + def conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, padding, layout, out_layout, out_dtype): """convert argument to workload""" diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py index 406b2c0edde0..62ff0a584bb8 100644 --- a/tutorials/autotvm/tune_nnvm_x86.py +++ b/tutorials/autotvm/tune_nnvm_x86.py @@ -126,7 +126,7 
@@ def tune_kernels(tasks, args = autotvm.task.nnvm_integration.serialize_args(args) task = autotvm.task.create("topi_x86_conv2d_NCHWc", args=args, target=target) task.workload = conv_NCHWc_arg_to_workload(data_plc, kernel_plc, kernel_size, - strides, padding, layout, dtype) + strides, padding, layout, layout, dtype) # create tuner if tuner == 'xgb' or tuner == 'xgb-rank': @@ -189,4 +189,4 @@ def tune_and_evaluate(tuning_opt): # We do not run the tuning in our webpage server since it takes too long. # Uncomment the following line to run it by yourself. -# tune_and_evaluate(tuning_option) +#tune_and_evaluate(tuning_option) From a65884d33da420ce41fd11cc7dec4b6344a4ac60 Mon Sep 17 00:00:00 2001 From: Wang Date: Fri, 28 Sep 2018 13:48:07 -0700 Subject: [PATCH 04/13] Fix conv2d --- topi/python/topi/x86/conv2d.py | 9 +++++---- tutorials/autotvm/tune_nnvm_x86.py | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index bf6d8f5cb442..142193c42bda 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -246,6 +246,8 @@ def conv2d_x86(data, kernel, strides, padding, layout, out_dtype): @conv2d_x86.register(["direct"]) def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): out_dtype = data.dtype if out_dtype is None else out_dtype + padding = padding if isinstance(padding, (tuple, list)) else (padding, padding) + strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) if layout == 'NCHW': _create_schedule_template(cfg, data, kernel, strides, padding, layout) args = [cfg, data, kernel, strides, padding, layout, out_dtype] @@ -476,10 +478,9 @@ def _alter_conv2d_layout(attrs, inputs, tinfo): if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1: return None - import ast - kernel_size = ast.literal_eval(attrs["kernel_size"]) - padding = ast.literal_eval(attrs["padding"]) - strides = 
ast.literal_eval(attrs["strides"]) + kernel_size = attrs.get_int_tuple("kernel_size") + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") layout = attrs['layout'] out_layout = layout if attrs["out_layout"] == "__undef__" else attrs["out_layout"] diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py index 62ff0a584bb8..fb8653bfdabd 100644 --- a/tutorials/autotvm/tune_nnvm_x86.py +++ b/tutorials/autotvm/tune_nnvm_x86.py @@ -6,7 +6,7 @@ This is a tutorial about how to tune convolution neural network for x86 cpu. """ - +import os import numpy as np import nnvm.testing @@ -79,6 +79,7 @@ def get_network(name, batch_size): # Set number of threads used for tuning based on the number of # physical cpu cores on your machine. num_threads = 1 +os.environ["TVM_NUM_THREADS"] = num_threads ################################################################# @@ -100,7 +101,7 @@ def get_network(name, batch_size): 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(n_parallel=1), + builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000), ), From 9f405ef429e044454810b29ea34a9db31a110abc Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 30 Sep 2018 14:20:34 -0700 Subject: [PATCH 05/13] Improve tutorial --- tutorials/autotvm/tune_nnvm_x86.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py index fb8653bfdabd..a8bb740fa07e 100644 --- a/tutorials/autotvm/tune_nnvm_x86.py +++ b/tutorials/autotvm/tune_nnvm_x86.py @@ -79,7 +79,7 @@ def get_network(name, batch_size): # Set number of threads used for tuning based on the number of # physical cpu cores on your machine. 
num_threads = 1 -os.environ["TVM_NUM_THREADS"] = num_threads +os.environ["TVM_NUM_THREADS"] = str(num_threads) ################################################################# @@ -191,3 +191,30 @@ def tune_and_evaluate(tuning_opt): # Uncomment the following line to run it by yourself. #tune_and_evaluate(tuning_option) + +###################################################################### +# Sample Output +# ------------- +# The tuning needs to compile many programs and extract feature from them. +# So a high performance CPU is recommended. +# One sample output is listed below. +# +# .. code-block:: bash +# +# Extract tasks... +# Tuning... +# [Task 1/12] Current/Best: 598.05/2497.63 GFLOPS | Progress: (252/252) | 1357.95 s Done. +# [Task 2/12] Current/Best: 522.63/2279.24 GFLOPS | Progress: (784/784) | 3989.60 s Done. +# [Task 3/12] Current/Best: 447.33/1927.69 GFLOPS | Progress: (784/784) | 3869.14 s Done. +# [Task 4/12] Current/Best: 481.11/1912.34 GFLOPS | Progress: (672/672) | 3274.25 s Done. +# [Task 5/12] Current/Best: 414.09/1598.45 GFLOPS | Progress: (672/672) | 2720.78 s Done. +# [Task 6/12] Current/Best: 508.96/2273.20 GFLOPS | Progress: (768/768) | 3718.75 s Done. +# [Task 7/12] Current/Best: 469.14/1955.79 GFLOPS | Progress: (576/576) | 2665.67 s Done. +# [Task 8/12] Current/Best: 230.91/1658.97 GFLOPS | Progress: (576/576) | 2435.01 s Done. +# [Task 9/12] Current/Best: 487.75/2295.19 GFLOPS | Progress: (648/648) | 3009.95 s Done. +# [Task 10/12] Current/Best: 182.33/1734.45 GFLOPS | Progress: (360/360) | 1755.06 s Done. +# [Task 11/12] Current/Best: 372.18/1745.15 GFLOPS | Progress: (360/360) | 1684.50 s Done. +# [Task 12/12] Current/Best: 215.34/2271.11 GFLOPS | Progress: (400/400) | 2128.74 s Done. +# Compile... +# Evaluate inference time cost... 
+# Mean inference time (std dev): 3.16 ms (0.03 ms) From fbde12f502a082c9d40e893e81f819ed223f4fb4 Mon Sep 17 00:00:00 2001 From: Wang Date: Sun, 30 Sep 2018 15:37:59 -0700 Subject: [PATCH 06/13] Fix default schedule --- topi/python/topi/x86/conv2d.py | 50 ++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 142193c42bda..81acc093c0fd 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -198,6 +198,28 @@ def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout): def _get_alter_layout_schedule_x86(wkl): return _get_schedule_conv(wkl) + +def _get_fp32_len(): + fp32_vec_len = 8 + target = tvm.target.current_target() + if target is not None: + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + fp32_vec_len = 16 + return fp32_vec_len + + +def _get_default_sch(workload): + fp32_vec_len = _get_fp32_len() + _, _, kh, kw, _ = workload[2] + is_kernel_1x1 = kh == 1 and kw == 1 + if is_kernel_1x1: + cfg = conv2d_avx_1x1._fallback_schedule(workload, fp32_vec_len) + else: + cfg = conv2d_avx_common._fallback_schedule(workload, fp32_vec_len) + return cfg + + def _create_schedule_template(cfg, data, kernel, strides, padding, layout): """Create schedule configuration from input arguments""" dshape = get_const_tuple(data.shape) @@ -250,6 +272,10 @@ def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) if layout == 'NCHW': _create_schedule_template(cfg, data, kernel, strides, padding, layout) + if cfg.is_fallback: + workload = conv_arg_to_workload(data, kernel, strides, padding, + layout, out_dtype) + cfg = _get_default_sch(workload) args = [cfg, data, kernel, strides, padding, layout, out_dtype] _, _, kh, kw = get_const_tuple(kernel.shape) is_kernel_1x1 = kh == 1 and kw == 1 @@ -295,7 +321,11 @@ def traverse(op): _, _, kh, 
kw = get_const_tuple(kernel.shape) is_kernel_1x1 = kh == 1 and kw == 1 - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, + current_cfg = cfg + if cfg.is_fallback: + workload = op.attrs["workload"] + current_cfg = _get_default_sch(workload) + args = [s, current_cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] if is_kernel_1x1: conv2d_avx_1x1._schedule_conv(*args) @@ -437,16 +467,6 @@ def conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, out_dtype]) -def _get_fp32_len(): - fp32_vec_len = 8 - target = tvm.target.current_target() - if target is not None: - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - fp32_vec_len = 16 - return fp32_vec_len - - def _query_dispatcher(workload, in_alter_op=False): dispatch_ctx = autotvm.task.DispatchContext.current if isinstance(dispatch_ctx, ApplyGraphBest): @@ -458,13 +478,7 @@ def _query_dispatcher(workload, in_alter_op=False): target = tvm.target.current_target() cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: - fp32_vec_len = _get_fp32_len() - _, _, kh, kw, _ = workload[2] - is_kernel_1x1 = kh == 1 and kw == 1 - if is_kernel_1x1: - cfg = conv2d_avx_1x1._fallback_schedule(workload, fp32_vec_len) - else: - cfg = conv2d_avx_common._fallback_schedule(workload, fp32_vec_len) + cfg = _get_default_sch(workload) return cfg From 3c7e7ab5f258ff9e34b5e257953d66cdfc0b0cbb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 30 Sep 2018 23:45:34 +0000 Subject: [PATCH 07/13] Fix 1x1 default schedule loading --- topi/python/topi/x86/conv2d.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 81acc093c0fd..65200cd97a73 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -323,7 +323,13 @@ def traverse(op): is_kernel_1x1 = kh == 1 and kw == 1 current_cfg = cfg if cfg.is_fallback: - workload = op.attrs["workload"] + workload_attr = 
op.attrs["workload"] + strides = (workload_attr[3][0].value, workload_attr[3][1].value) + padding = (workload_attr[4][0].value, workload_attr[4][1].value) + layout = workload_attr[5].value + out_dtype = workload_attr[6].value + workload = conv_arg_to_workload(data, kernel, strides, padding, + layout, out_dtype) current_cfg = _get_default_sch(workload) args = [s, current_cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] From 30ad5b6a8ddca022accf588912ecf97d71704520 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 1 Oct 2018 00:13:39 +0000 Subject: [PATCH 08/13] Fix workload type --- topi/python/topi/x86/conv2d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 65200cd97a73..f8d170e3e516 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -324,8 +324,8 @@ def traverse(op): current_cfg = cfg if cfg.is_fallback: workload_attr = op.attrs["workload"] - strides = (workload_attr[3][0].value, workload_attr[3][1].value) - padding = (workload_attr[4][0].value, workload_attr[4][1].value) + strides = (int(workload_attr[3][0].value), int(workload_attr[3][1].value)) + padding = (int(workload_attr[4][0].value), int(workload_attr[4][1].value)) layout = workload_attr[5].value out_dtype = workload_attr[6].value workload = conv_arg_to_workload(data, kernel, strides, padding, From 2a609089a7af3b7cae5313a2081937020c8ec064 Mon Sep 17 00:00:00 2001 From: Wang Date: Mon, 8 Oct 2018 14:33:10 -0700 Subject: [PATCH 09/13] Change gridsearch to random --- tutorials/autotvm/tune_nnvm_x86.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py index a8bb740fa07e..ec7dcab634eb 100644 --- a/tutorials/autotvm/tune_nnvm_x86.py +++ b/tutorials/autotvm/tune_nnvm_x86.py @@ -97,7 +97,7 @@ def get_network(name, batch_size): tuning_option = { 'log_filename': log_file, - 
'tuner': 'gridsearch', + 'tuner': 'random', 'early_stopping': None, 'measure_option': autotvm.measure_option( @@ -190,7 +190,7 @@ def tune_and_evaluate(tuning_opt): # We do not run the tuning in our webpage server since it takes too long. # Uncomment the following line to run it by yourself. -#tune_and_evaluate(tuning_option) +# tune_and_evaluate(tuning_option) ###################################################################### # Sample Output From c63aeb63bfc34a49674183371401a443c1b5e5b1 Mon Sep 17 00:00:00 2001 From: Wang Date: Wed, 10 Oct 2018 10:57:13 -0700 Subject: [PATCH 10/13] Add reference to autotvm arm --- tutorials/autotvm/tune_nnvm_x86.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py index ec7dcab634eb..ddd91f584c08 100644 --- a/tutorials/autotvm/tune_nnvm_x86.py +++ b/tutorials/autotvm/tune_nnvm_x86.py @@ -92,8 +92,8 @@ def get_network(name, batch_size): # instead of plain conv2d. # # We will use local mode for tuning configuration. RPC tracker -# mode can be setup similarly to the approach in autotvm -# arm_cpu tutorial. +# mode can be set up similarly to the approach in +# :ref:`tune_nnvm_arm` tutorial. 
tuning_option = { 'log_filename': log_file, From 9c0ee50e8635aaafa27698b267cf81001bd18ade Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 16 Oct 2018 12:03:41 -0700 Subject: [PATCH 11/13] Merge conv2d common and 1x1 decl --- topi/python/topi/x86/conv2d.py | 159 +++++++++++++++++----- topi/python/topi/x86/conv2d_avx_1x1.py | 100 +------------- topi/python/topi/x86/conv2d_avx_common.py | 116 ---------------- 3 files changed, 128 insertions(+), 247 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index f8d170e3e516..460befd80f0b 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -11,6 +11,7 @@ from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \ _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \ _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload +from ..nn.pad import pad from . import conv2d_avx_1x1, conv2d_avx_common from .conv2d_avx_common import AVXConvCommonFwd @@ -277,10 +278,7 @@ def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): layout, out_dtype) cfg = _get_default_sch(workload) args = [cfg, data, kernel, strides, padding, layout, out_dtype] - _, _, kh, kw = get_const_tuple(kernel.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - return conv2d_avx_1x1._declaration_conv(*args) if is_kernel_1x1 else \ - conv2d_avx_common._declaration_conv(*args) + return _declaration_conv_impl(*args) elif layout == 'HWCN': return nn.conv2d_hwcn(data, kernel, strides, padding, out_dtype) elif layout == 'NHWC': @@ -289,6 +287,72 @@ def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): raise ValueError("not support this layout {} yet".format(layout)) +def _declaration_conv_impl(cfg, data, kernel, strides, padding, layout, out_dtype): + out_dtype = data.dtype if out_dtype is None else out_dtype + assert layout == 'NCHW', "only support NCHW convolution for AVX" + + HPAD, WPAD = padding + HSTR, WSTR = strides + + batch_size, 
in_channel, in_height, in_width = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) + + pad_height = in_height + 2 * HPAD + pad_width = in_width + 2 * WPAD + + out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 + out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 + + # pack data + DOPAD = (HPAD != 0 or WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data + + # fetch schedule + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width) + data_vec = tvm.compute(shape, + lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w], + name='data_vec') + + # pack kernel + shape = (num_filter//oc_bn, in_channel//ic_bn, + kernel_height, kernel_width, ic_bn, oc_bn) + kernel_vec = tvm.compute(shape, + lambda CO, CI, h, w, ci, co: + kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], + name='kernel_vec') + + # convolution + oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn) + unpack_shape = (batch_size, num_filter, out_height, out_width) + + ic = tvm.reduce_axis((0, in_channel), name='ic') + kh = tvm.reduce_axis((0, kernel_height), name='kh') + kw = tvm.reduce_axis((0, kernel_width), name='kw') + + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh, ic%ic_bn, + ow*WSTR+kw].astype(out_dtype) * + kernel_vec[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, + oc_block].astype(out_dtype), + axis=[ic, kh, kw]), name='conv') + + unpack = tvm.compute(unpack_shape, + lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn] + .astype(out_dtype), + name='output_unpack', + tag='conv2d_nchw', + attrs={'workload': + conv_arg_to_workload(data, kernel, strides, + padding, layout, + out_dtype)}) + return unpack + + @autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct']) def schedule_conv2d(cfg, 
outs): """Create schedule for tensors""" @@ -408,13 +472,9 @@ def _topi_nn_conv2d_NCHWc(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) data, kernel = args[:2] - kernel_size = args[3] strides = args[4] padding = args[5] layout = args[6] - kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \ - (kernel_size, kernel_size) - is_kernel_1x1 = kh == 1 and kw == 1 raw_data_shape = get_const_tuple(data.shape) raw_kernel_shape = get_const_tuple(kernel.shape) @@ -429,12 +489,8 @@ def _topi_nn_conv2d_NCHWc(*args, **kwargs): raw_data_shape[2], raw_data_shape[3], ic_bn) data_layout = "NCHW%dc" % ic_bn out_layout = "NCHW%dc" % oc_bn - if is_kernel_1x1: - new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, - ic_bn, oc_bn, raw_kernel_shape[2], raw_kernel_shape[3]) - else: - new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, - raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) + new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, + raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) args[0] = tvm.placeholder(new_data_shape, data.dtype) args[1] = tvm.placeholder(new_kernel_shape, kernel.dtype) args[6] = data_layout @@ -451,21 +507,14 @@ def conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, """convert argument to workload""" dshape = get_const_tuple(data.shape) kshape = get_const_tuple(kernel.shape) - kh, kw = kernel_size if isinstance(kernel_size, (tuple, list)) else \ - (kernel_size, kernel_size) - is_kernel_1x1 = kh == 1 and kw == 1 if len(dshape) > 4: raw_data = tvm.placeholder((dshape[0], dshape[1] * dshape[4], dshape[2], dshape[3]), dtype=kernel.dtype) else: raw_data = data if len(kshape) > 4: - if is_kernel_1x1: - raw_kernel = tvm.placeholder((kshape[0] * kshape[3], kshape[1] * kshape[2], - kshape[4], kshape[5]), dtype=kernel.dtype) - else: - raw_kernel = tvm.placeholder((kshape[0] * 
kshape[5], kshape[1] * kshape[4], - kshape[2], kshape[3]), dtype=kernel.dtype) + raw_kernel = tvm.placeholder((kshape[0] * kshape[5], kshape[1] * kshape[4], + kshape[2], kshape[3]), dtype=kernel.dtype) else: raw_kernel = kernel return ('conv2d_NCHWc', ) + autotvm.task.args_to_workload( @@ -506,8 +555,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfo): dtype = data.dtype out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"] - kh, kw = kernel_size - is_kernel_1x1 = kh == 1 and kw == 1 workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, strides, padding, layout, out_layout, out_dtype) cfg = _query_dispatcher(workload, True) @@ -524,12 +571,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfo): global_dict_key = workload dispatch_ctx.update_global_dict(global_dict_key, cfg) - if is_kernel_1x1: - # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w) - new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn) - else: - # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) - new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) @@ -569,8 +612,58 @@ def _declaration_conv_NCHWc(cfg, data, kernel, num_filter, kernel_size, strides, else conv2d_avx_common._declaration_conv_NCHWc_int8(wkl, sch, data, kernel) args = [cfg, data, kernel, (kh, kw), (sh, sw), (ph, pw), layout, out_layout, out_dtype] - return conv2d_avx_1x1._declaration_conv_NCHWc(*args) if is_kernel_1x1 else \ - conv2d_avx_common._declaration_conv_NCHWc(*args) + return _declaration_conv_NCHWc_impl(*args) + + +def _declaration_conv_NCHWc_impl(cfg, data, kernel, kernel_size, strides, padding, layout, + out_layout, out_dtype): + HPAD, WPAD = padding + HSTR, WSTR = strides + + n, ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape) + ic = ic_chunk * ic_block + kh, kw = kernel_size + oc_chunk, _, _, _, _, oc_block = 
get_const_tuple(kernel.shape) + oc = oc_chunk * oc_block + oh = (ih + 2 * HPAD - kh) // HSTR + 1 + ow = (iw + 2 * WPAD - kw) // WSTR + 1 + + # DOPAD + DOPAD = (HPAD != 0 or WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") + else: + data_pad = data + + # fetch schedule + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + if ic_bn != ic_block: + raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d." + % (ic_bn, ic_block)) + if oc_bn != oc_block: + raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d." + % (oc_bn, oc_block)) + + # convolution + oshape = (n, oc//oc_bn, oh, ow, oc_bn) + + ic = tvm.reduce_axis((0, ic), name='ic') + kh = tvm.reduce_axis((0, kernel_size[0]), name='kh') + kw = tvm.reduce_axis((0, kernel_size[1]), name='kw') + + workload = conv2d.conv_NCHWc_arg_to_workload(data, kernel, + kernel_size, + strides, padding, + layout, out_layout, + out_dtype), + attrs = {'workload': workload} + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw, + ic%ic_bn].astype(out_dtype) * + kernel[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, oc_block], + axis=[ic, kh, kw]), + name='conv2d_NCHWc', tag="conv2d_NCHWc", attrs=attrs) + return conv @generic.schedule_conv2d_NCHWc.register("cpu") diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 10c57dc3fd46..b3b0166ae111 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -80,56 +80,6 @@ def _fallback_schedule(wkl, simd_width): raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) -def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): - assert layout == 'NCHW', "only support NCHW convolution for AVX" - - HPAD, WPAD = padding - HSTR, WSTR = strides - - batch_size, in_channel, in_height, in_width = 
get_const_tuple(data.shape) - num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) - - pad_height = in_height + 2 * HPAD - pad_width = in_width + 2 * WPAD - - out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 - out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 - - DOPAD = (HPAD != 0 or WPAD != 0) - if DOPAD: - data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") - else: - data_pad = data - - # fetch schedule - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - - shape = (batch_size, in_channel // ic_bn, pad_height, pad_width, ic_bn) - data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, C * ic_bn + c, h, w]) - - shape = (num_filter // oc_bn, in_channel // ic_bn, ic_bn, oc_bn, 1, 1) - kernel_vec = tvm.compute(shape, lambda CO, CI, ci, co, h, w: - kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], - name='kernel_vec') - - oshape = (batch_size, num_filter // oc_bn, out_height, out_width, oc_bn) - ic = tvm.reduce_axis((0, in_channel), name='ic') - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR, ow*WSTR, ic%ic_bn] * - kernel_vec[oc_chunk, ic//ic_bn, ic%ic_bn, oc_block, 0, 0], - axis=[ic]), name='conv') - - oshape = (batch_size, num_filter, out_height, out_width) - unpack = tvm.compute(oshape, lambda n, oc, oh, ow: - conv[n, oc // oc_bn, oh, ow, oc % oc_bn], - tag='conv2d_nchw', - attrs={'workload': - topi.x86.conv2d.conv_arg_to_workload(data, kernel, - strides, padding, - layout, out_dtype)}) - return unpack - - def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): # fetch schedule ic_bn, oc_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], @@ -166,7 +116,7 @@ def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, outpu s[CC].compute_at(s[C], oh_outer) _, oc_chunk, oh, ow, oc_block = s[CC].op.axis - ic, = s[CC].op.reduce_axis + ic, _, _ = 
s[CC].op.reduce_axis ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) @@ -197,52 +147,6 @@ def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, outpu return s -def _declaration_conv_NCHWc(cfg, data, kernel, kernel_size, strides, padding, layout, - out_layout, out_dtype): - HPAD, WPAD = padding - HSTR, WSTR = strides - - n, ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape) - ic = ic_chunk * ic_block - kh, kw = kernel_size - oc_chunk, _, _, oc_block, _, _ = get_const_tuple(kernel.shape) - oc = oc_chunk * oc_block - oh = (ih + 2 * HPAD - kh) // HSTR + 1 - ow = (iw + 2 * WPAD - kw) // WSTR + 1 - - DOPAD = (HPAD != 0 or WPAD != 0) - if DOPAD: - data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") - else: - data_pad = data - - # fetch schedule - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - if ic_bn != ic_block: - raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d." - % (ic_bn, ic_block)) - if oc_bn != oc_block: - raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d." 
- % (oc_bn, oc_block)) - - # convolution - workload = topi.x86.conv2d.conv_NCHWc_arg_to_workload(data, kernel, - kernel_size, - strides, padding, - layout, out_layout, - out_dtype), - attrs = {'workload': workload} - oshape = (n, oc//oc_bn, oh, ow, oc_bn) - ic = tvm.reduce_axis((0, ic), name='ic') - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR, ow*WSTR, - ic%ic_bn].astype(out_dtype) * - kernel[oc_chunk, ic // ic_bn, ic % ic_bn, oc_block, 0, 0], - axis=[ic]), - name='conv2d_NCHWc', tag='conv2d_NCHWc', attrs=attrs) - return conv - - def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): # fetch schedule ic_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oh"].val, @@ -270,7 +174,7 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): s[C].parallel(parallel_axis) _, oc_chunk, oh, ow, oc_block = s[CC].op.axis - ic, = s[CC].op.reduce_axis + ic, _, _ = s[CC].op.reduce_axis ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 56a5b6155ef5..88ada27298de 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -77,71 +77,6 @@ def _fallback_schedule(wkl, simd_width): return ConfigEntity.from_json_dict(cfg_dict) -def _declaration_conv(cfg, data, kernel, strides, padding, layout, out_dtype): - out_dtype = data.dtype if out_dtype is None else out_dtype - assert layout == 'NCHW', "only support NCHW convolution for AVX" - - HPAD, WPAD = padding - HSTR, WSTR = strides - - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) - - pad_height = in_height + 2 * HPAD - pad_width = in_width + 2 * WPAD - - out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 - out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 - - # pack data - DOPAD = (HPAD 
!= 0 or WPAD != 0) - if DOPAD: - data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") - else: - data_pad = data - - # fetch schedule - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - - shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width) - data_vec = tvm.compute(shape, - lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w], - name='data_vec') - - # pack kernel - shape = (num_filter//oc_bn, in_channel//ic_bn, - kernel_height, kernel_width, ic_bn, oc_bn) - kernel_vec = tvm.compute(shape, lambda CO, CI, h, w, ci, co: - kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], - name='kernel_vec') - - # convolution - oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn) - unpack_shape = (batch_size, num_filter, out_height, out_width) - - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh, ic%ic_bn, - ow*WSTR+kw].astype(out_dtype) * - kernel_vec[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, - oc_block].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') - - unpack = tvm.compute(unpack_shape, - lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn] - .astype(out_dtype), - name='output_unpack', - tag='conv2d_nchw', - attrs={'workload': - topi.x86.conv2d.conv_arg_to_workload(data, kernel, - strides, padding, - layout, out_dtype)}) - return unpack - - def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], @@ -213,57 +148,6 @@ def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, outpu return s -def _declaration_conv_NCHWc(cfg, data, kernel, kernel_size, strides, padding, layout, - out_layout, out_dtype): - HPAD, WPAD = padding - HSTR, WSTR = strides - - n, 
ic_chunk, ih, iw, ic_block = get_const_tuple(data.shape) - ic = ic_chunk * ic_block - kh, kw = kernel_size - oc_chunk, _, _, _, _, oc_block = get_const_tuple(kernel.shape) - oc = oc_chunk * oc_block - oh = (ih + 2 * HPAD - kh) // HSTR + 1 - ow = (iw + 2 * WPAD - kw) // WSTR + 1 - - # DOPAD - DOPAD = (HPAD != 0 or WPAD != 0) - if DOPAD: - data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") - else: - data_pad = data - - # fetch schedule - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - if ic_bn != ic_block: - raise RuntimeError("ic_bn in config is not equal to actual data ic_block: %d vs %d." - % (ic_bn, ic_block)) - if oc_bn != oc_block: - raise RuntimeError("oc_bn in config is not equal to actual kernel oc_block: %d vs %d." - % (oc_bn, oc_block)) - - # convolution - oshape = (n, oc//oc_bn, oh, ow, oc_bn) - - ic = tvm.reduce_axis((0, ic), name='ic') - kh = tvm.reduce_axis((0, kernel_size[0]), name='kh') - kw = tvm.reduce_axis((0, kernel_size[1]), name='kw') - - workload = topi.x86.conv2d.conv_NCHWc_arg_to_workload(data, kernel, - kernel_size, - strides, padding, - layout, out_layout, - out_dtype), - attrs = {'workload': workload} - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw, - ic%ic_bn].astype(out_dtype) * - kernel[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, oc_block], - axis=[ic, kh, kw]), - name='conv2d_NCHWc', tag="conv2d_NCHWc", attrs=attrs) - return conv - - def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): # fetch schedule ic_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_ow"].size[-1], From 9d442014d29f5c7ee429bf485c3495228f05cf8c Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 16 Oct 2018 12:19:08 -0700 Subject: [PATCH 12/13] Fix lint --- topi/python/topi/x86/conv2d_avx_1x1.py | 1 - topi/python/topi/x86/conv2d_avx_common.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py 
b/topi/python/topi/x86/conv2d_avx_1x1.py index b3b0166ae111..96affc7b9d23 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -7,7 +7,6 @@ import topi -from ..util import get_const_tuple from ..nn.util import infer_pad from ..nn.pad import pad from .tensor_intrin import dot_16x1x16_int8_int8_int32 diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 88ada27298de..eaa3d15e64b0 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -5,9 +5,6 @@ import tvm from tvm.autotvm.task import ConfigEntity -import topi - -from ..util import get_const_tuple from ..nn.util import infer_pad from ..nn.pad import pad from .tensor_intrin import dot_16x1x16_int8_int8_int32 From f959a4168081dd2cbe2600455e1d9ebe28fe0485 Mon Sep 17 00:00:00 2001 From: Wang Date: Tue, 16 Oct 2018 13:12:19 -0700 Subject: [PATCH 13/13] Minor fix --- topi/python/topi/x86/conv2d.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 460befd80f0b..f766d827686d 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -651,11 +651,9 @@ def _declaration_conv_NCHWc_impl(cfg, data, kernel, kernel_size, strides, paddin kh = tvm.reduce_axis((0, kernel_size[0]), name='kh') kw = tvm.reduce_axis((0, kernel_size[1]), name='kw') - workload = conv2d.conv_NCHWc_arg_to_workload(data, kernel, - kernel_size, - strides, padding, - layout, out_layout, - out_dtype), + workload = conv_NCHWc_arg_to_workload(data, kernel, kernel_size, + strides, padding, layout, + out_layout, out_dtype), attrs = {'workload': workload} conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw,