From d572a5567a2a860696f2422b2029a763be950477 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 21 Dec 2020 15:33:59 +0000 Subject: [PATCH 01/28] integrated with v0.8 --- python/tvm/relay/op/strategy/x86.py | 8 +- python/tvm/topi/arm_cpu/__init__.py | 1 + python/tvm/topi/arm_cpu/group_conv2d.py | 342 +++++++++++++++++++++++ python/tvm/topi/x86/__init__.py | 1 + python/tvm/topi/x86/group_conv2d.py | 343 ++++++++++++++++++++++++ 5 files changed, 690 insertions(+), 5 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/group_conv2d.py create mode 100644 python/tvm/topi/x86/group_conv2d.py diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1f37a4f8e98c..60bd92ef63d1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -205,12 +205,10 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - if not is_auto_scheduler_enabled(): - logger.warning("group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.generic", + wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.x86", ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index e121fbc7ec6d..9e2057a7126f 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -26,3 +26,4 @@ from .bitserial_dense import * from .injective import * from . import cortex_m7 +from .group_conv2d import * diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py new file mode 100644 index 000000000000..bf0925698ec5 --- /dev/null +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -0,0 +1,342 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" + +import tvm +from tvm import autotvm +from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from ..utils import get_const_tuple +from ..nn.pad import pad +from .. import tag + +from ..nn.utils import infer_pad +from ..nn.conv2d import _get_workload as _get_conv2d_workload + + + + +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, + out_dtype): + """Compute group_conv2d with NCHW layout""" + return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding, + dilation, groups, out_dtype) + + +def schedule_group_conv2d_nchw(outs): + """Compute group_conv2d with NCHW layout""" + return schedule_group_conv2d_nchwc(outs) + + +def _get_default_config(cfg, data, kernel, strides, padding, groups, + out_dtype, layout='NCHW'): + """ + Get default schedule config for the workload + """ + static_data_shape = [] + for dim in get_const_tuple(data.shape): + if isinstance(dim, tvm.tir.Var): + static_data_shape.append(1) + else: + static_data_shape.append(dim) + data = te.placeholder(static_data_shape, dtype=data.dtype) + + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, + layout) + _fallback_schedule(cfg, wkl) + + +def _fallback_schedule(cfg, wkl): + simd_width = 4 # assume ARM SIMD Width is 4 + pad_left, pad_right = wkl.padl, wkl.padr + stride_w = wkl.wstride + out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if kernels_per_group % bn == 0: + oc_bn = bn + break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if kernel_depth % bn == 0: + ic_bn = bn + break + if ic_bn > kernel_depth: + ic_bn = kernel_depth + + reg_n = 1 + for n in range(31, 0, -1): + if out_width % n == 0: + reg_n = n + break + + cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) + cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) + cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) + cfg["unroll_kw"] = OtherOptionEntity(False) + + +@autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") +def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, + dilation, groups, out_dtype='float32'): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm. + https://arxiv.org/abs/2006.09791 + """ + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(dilation, int): + dilation_h, dilation_w = dilation, dilation + else: + dilation_h, dilation_w = dilation + + assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 + if isinstance(padding, int): + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + elif len(padding) == 2: + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad + else: + pad_top, pad_left, pad_bottom, pad_right = padding + + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right + + assert isinstance(strides, int) or len(strides) == 2 + if isinstance(strides, int): + stride_h, stride_w = strides, strides + else: + stride_h, stride_w = strides + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right + + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + + kernels_per_group = out_channel // groups + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), + dtype=data.dtype), + te.placeholder((out_channel, in_channel // groups, k_height, k_width), + dtype=kernel.dtype), + strides, padding, groups, out_dtype) + + oc_bn = cfg['tile_oc'].size[-1] + ic_bn = cfg['tile_ic'].size[-1] + # pack data + DOPAD = (hpad != 0 or wpad != 0) + if DOPAD: + data_pad = pad(data, + (0, 0, pad_top, pad_left), + (0, 0, pad_bottom, pad_right), + name="data_pad") + else: + data_pad = data + + shape = (groups, batch_size, kernel_depth // ic_bn, + pad_height, ic_bn, pad_width) + + data_vec = te.compute(shape, + lambda g, n, C, h, c, w: + data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name='data_vec') + + # pack kernel + shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, + k_height, k_width, ic_bn, oc_bn) + kernel_vec = te.compute(shape, + lambda g, out_channel, in_channel, h, w, ci, co: + kernel[(out_channel * oc_bn + co + g * kernels_per_group), + in_channel * ic_bn + ci, h, w], + name='kernel_vec') + + # convolution + oshape = (groups, batch_size, kernels_per_group//oc_bn, + out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) + + ic = te.reduce_axis((0, (kernel_depth)), name='ic') + kh = te.reduce_axis((0, k_height), name='kh') + kw = te.reduce_axis((0, k_width), name='kw') + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + + conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: + te.sum(data_vec[g, n, idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w].astype(out_dtype) * + kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), + kh, kw, idxmod(ic, ic_bn), + oc_block].astype(out_dtype), + axis=[ic, kh, kw]), name='conv') + + unpack = te.compute(unpack_shape, + lambda n, c, h, w: + conv[idxdiv(c, kernels_per_group), n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, w, + idxmod(idxmod(c, oc_bn), kernels_per_group)] + .astype(out_dtype), + name='output_unpack', + tag='group_conv2d_nchw') + return unpack + + +@autotvm.register_topi_schedule("group_conv2d_nchw.arm_cpu") +def schedule_group_conv2d_nchwc(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + + if 'group_conv2d_nchw' in op.tag: + output = op.output(0) + + if "tile_ic" not in cfg: + return + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, + output, outs[0]] + _schedule_gspc_nchw(*args) + + scheduled_ops.append(op) + + traverse(outs[0].op) + return s + + +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, + conv_out, output, last): + """Schedule GSPC""" + ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) + + # no stride and padding info here + padding = infer_pad(data, data_pad) + hpad, wpad = padding + DOPAD = (hpad != 0 or wpad != 0) + + _, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if DOPAD: + s[A0].compute_inline() + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + + parallel_axis = s[A1].fuse(batch, ic_chunk, ih) + s[A1].parallel(parallel_axis) + + # schedule kernel pack + groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + + if oc_bn > 1: + s[W].vectorize(oc_block) + + parallel_axis = s[W].fuse(groups, oc_chunk, oh) + s[W].parallel(parallel_axis) + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, 'global') + + _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis + + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) + + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis + + ic, kh, kw = s[CC].op.reduce_axis + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) + + if unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, + ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, + ow_block, oc_block) + + parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + + s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + return s diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index bb6a7cdd4122..d1bd58dd4831 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -41,3 +41,4 @@ from .conv2d_alter_op import * from .dense_alter_op import * from .scatter import * +from .group_conv2d import * diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py new file mode 100644 index 000000000000..158795eb4e17 --- /dev/null +++ b/python/tvm/topi/x86/group_conv2d.py @@ -0,0 +1,343 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" + +import tvm +from tvm import autotvm +from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from .utils import get_fp32_len +from ..utils import get_const_tuple +from ..nn.pad import pad +from .. import tag + +from ..nn.utils import infer_pad +from ..nn.conv2d import _get_workload as _get_conv2d_workload + + +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, + out_dtype): + """Compute group_conv2d with NCHW layout""" + return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding, + dilation, groups, out_dtype) + + +def schedule_group_conv2d_nchw(outs): + """Compute group_conv2d with NCHW layout""" + return schedule_group_conv2d_nchwc(outs) + + +def _get_default_config(cfg, data, kernel, strides, padding, groups, + out_dtype, layout='NCHW'): + """ + Get default schedule config for the workload + """ + static_data_shape = [] + for dim in get_const_tuple(data.shape): + if isinstance(dim, tvm.tir.Var): + static_data_shape.append(1) + else: + static_data_shape.append(dim) + data = te.placeholder(static_data_shape, dtype=data.dtype) + + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, + layout) + _fallback_schedule(cfg, wkl) + + +def _fallback_schedule(cfg, wkl): + simd_width = get_fp32_len() + hpad = wkl.hpad + stride_w = wkl.wstride + out_width = (wkl.width + 2*hpad - wkl.wkernel) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if kernels_per_group % bn == 0: + oc_bn = bn + break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if kernel_depth % bn == 0: + ic_bn = bn + break + if ic_bn > kernel_depth: + ic_bn = kernel_depth + + reg_n = 1 + for n in range(31, 0, -1): + if out_width % n == 0: + reg_n = n + break + + cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) + cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) + cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) + cfg["unroll_kw"] = OtherOptionEntity(False) + + +@autotvm.register_topi_compute("group_conv2d_nchw.x86") +def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, + dilation, groups, out_dtype='float32'): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm. + https://arxiv.org/abs/2006.09791 + """ + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(dilation, int): + dilation_h, dilation_w = dilation, dilation + else: + dilation_h, dilation_w = dilation + + assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 + if isinstance(padding, int): + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + elif len(padding) == 2: + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad + else: + pad_top, pad_left, pad_bottom, pad_right = padding + + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right + + assert isinstance(strides, int) or len(strides) == 2 + if isinstance(strides, int): + stride_h, stride_w = strides, strides + else: + stride_h, stride_w = strides + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right + + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + + kernels_per_group = out_channel // groups + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), + dtype=data.dtype), + te.placeholder((out_channel, in_channel // groups, k_height, k_width), + dtype=kernel.dtype), + strides, padding, groups, out_dtype) + + oc_bn = cfg['tile_oc'].size[-1] + ic_bn = cfg['tile_ic'].size[-1] + # pack data + DOPAD = (hpad != 0 or wpad != 0) + if DOPAD: + data_pad = pad(data, + (0, 0, pad_top, pad_left), + (0, 0, pad_bottom, pad_right), + name="data_pad") + else: + data_pad = data + + shape = (groups, batch_size, kernel_depth // ic_bn, + pad_height, ic_bn, pad_width) + + data_vec = te.compute(shape, + lambda g, n, C, h, c, w: + data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name='data_vec') + + # pack kernel + shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, + k_height, k_width, ic_bn, oc_bn) + kernel_vec = te.compute(shape, + lambda g, out_channel, in_channel, h, w, ci, co: + kernel[(out_channel * oc_bn + co + g * kernels_per_group), + in_channel * ic_bn + ci, h, w], + name='kernel_vec') + + # convolution + oshape = (groups, batch_size, kernels_per_group//oc_bn, + out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) + + ic = te.reduce_axis((0, (kernel_depth)), name='ic') + kh = te.reduce_axis((0, k_height), name='kh') + kw = te.reduce_axis((0, k_width), name='kw') + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + + conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: + te.sum(data_vec[g, n, idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w].astype(out_dtype) * + kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), + kh, kw, idxmod(ic, ic_bn), + oc_block].astype(out_dtype), + axis=[ic, kh, kw]), name='conv') + + unpack = te.compute(unpack_shape, + lambda n, c, h, w: + conv[idxdiv(c, kernels_per_group), n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, w, + idxmod(idxmod(c, oc_bn), kernels_per_group)] + .astype(out_dtype), + name='output_unpack', + tag='group_conv2d_nchw') + return unpack + + +@autotvm.register_topi_schedule("group_conv2d_nchw.x86") +def schedule_group_conv2d_nchwc(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + + if 'group_conv2d_nchw' in op.tag: + output = op.output(0) + + if "tile_ic" not in cfg: + return + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, + output, outs[0]] + _schedule_gspc_nchw(*args) + + scheduled_ops.append(op) + + traverse(outs[0].op) + return s + + +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, + conv_out, output, last): + """Schedule GSPC""" + ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) + + # no stride and padding info here + padding = infer_pad(data, data_pad) + hpad, wpad = padding + DOPAD = (hpad != 0 or wpad != 0) + + _, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if DOPAD: + s[A0].compute_inline() + + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + + parallel_axis = s[A1].fuse(batch, ic_chunk, ih) + s[A1].parallel(parallel_axis) + + # schedule kernel pack + groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + + if oc_bn > 1: + s[W].vectorize(oc_block) + + parallel_axis = s[W].fuse(groups, oc_chunk, oh) + s[W].parallel(parallel_axis) + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, 'global') + + _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis + + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) + + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis + + ic, kh, kw = s[CC].op.reduce_axis + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) + + if unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, + ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, + ow_block, oc_block) + + parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + + s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + + s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + return s From f32132bed42e71d2f00e1e1e75d2d168177de18c Mon Sep 17 00:00:00 2001 From: Wheest Date: Sun, 26 Jul 2020 15:16:05 +0100 Subject: [PATCH 02/28] Rebase, and undoing accidental removal of auto scheduler NHWC support --- python/tvm/relay/op/strategy/arm_cpu.py | 7 +- python/tvm/topi/arm_cpu/group_conv2d.py | 168 ++++++++++------------- python/tvm/topi/x86/group_conv2d.py | 172 ++++++++++-------------- 3 files changed, 141 insertions(+), 206 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 985124e305ee..36200bb024a9 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -207,11 +207,10 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.generic", + wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.arm_cpu" ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index bf0925698ec5..c8ae9bc46fbe 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -1,36 +1,15 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member -# pylint: disable=no-value-for-parameter,import-outside-toplevel -"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" - import tvm from tvm import autotvm from tvm import te -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity - -from ..utils import get_const_tuple +from ..util import get_const_tuple from ..nn.pad import pad from .. import tag -from ..nn.utils import infer_pad +from ..nn.conv2d import group_conv2d_nchw +from ..nn.util import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload - +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, @@ -45,8 +24,8 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, - out_dtype, layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, + layout='NCHW'): """ Get default schedule config for the workload """ @@ -65,28 +44,24 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, def _fallback_schedule(cfg, wkl): simd_width = 4 # assume ARM SIMD Width is 4 - pad_left, pad_right = wkl.padl, wkl.padr - stride_w = wkl.wstride - out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 - groups = wkl.groups - kernels_per_group = wkl.out_filter // groups - kernel_depth = wkl.in_filter // groups - + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + G = wkl.groups + KPG = wkl.out_filter // G + CPG = wkl.in_filter // G oc_bn = 1 + for bn in range(simd_width, 0, -1): - if kernels_per_group % bn == 0: + if KPG % bn == 0: oc_bn = bn break - if oc_bn > kernels_per_group: - oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if kernel_depth % bn == 0: + if CPG % bn == 0: ic_bn = bn break - if ic_bn > kernel_depth: - ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -103,10 +78,6 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'): - """ - Compute group conv2d with NCHW layout, using GSPC algorithm. - https://arxiv.org/abs/2006.09791 - """ assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(dilation, int): dilation_h, dilation_w = dilation, dilation @@ -115,94 +86,87 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + HPAD, WPAD = padding, padding elif len(padding) == 2: - hpad, wpad = padding - pad_top, pad_bottom = hpad, hpad - pad_left, pad_right = wpad, wpad + HPAD, WPAD = padding else: - pad_top, pad_left, pad_bottom, pad_right = padding - - hpad = pad_top + pad_bottom - wpad = pad_left + pad_right + HPAD, _, WPAD, _ = padding assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - stride_h, stride_w = strides, strides + HSTR, WSTR = strides, strides else: - stride_h, stride_w = strides + HSTR, WSTR = strides - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + N, CI, IH, IW = get_const_tuple(data.shape) + CO, CIG, KH, KW = get_const_tuple(kernel.shape) - pad_height = in_height + pad_top + pad_bottom - pad_width = in_width + pad_left + pad_right + pad_height = IH + 2 * HPAD + pad_width = IW + 2 * WPAD - dilated_kernel_h = (k_height - 1) * dilation_h + 1 - dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + dilated_kernel_h = (KH - 1) * dilation_h + 1 + dilated_kernel_w = (KW - 1) * dilation_w + 1 + OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 + OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 - kernels_per_group = out_channel // groups + G = groups + KPG = CO // G + CPG = CI // G - cfg.define_split("tile_ic", in_channel, num_outputs=2) - cfg.define_split("tile_oc", out_channel, num_outputs=2) - cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", CI, num_outputs=2) + cfg.define_split("tile_oc", CO, num_outputs=2) + cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), - dtype=data.dtype), - te.placeholder((out_channel, in_channel // groups, k_height, k_width), + _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype), + te.placeholder((N, CI // G, KH, KW), dtype=kernel.dtype), strides, padding, groups, out_dtype) oc_bn = cfg['tile_oc'].size[-1] ic_bn = cfg['tile_ic'].size[-1] # pack data - DOPAD = (hpad != 0 or wpad != 0) + DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: - data_pad = pad(data, - (0, 0, pad_top, pad_left), - (0, 0, pad_bottom, pad_right), - name="data_pad") + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data - shape = (groups, batch_size, kernel_depth // ic_bn, + shape = (G, N, CPG // ic_bn, pad_height, ic_bn, pad_width) data_vec = te.compute(shape, lambda g, n, C, h, c, w: - data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + data_pad[n, C * ic_bn + c + CPG * g, h, w], name='data_vec') # pack kernel - shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, - k_height, k_width, ic_bn, oc_bn) + shape = (G, KPG//oc_bn, CPG//ic_bn, + KH, KW, ic_bn, oc_bn) kernel_vec = te.compute(shape, - lambda g, out_channel, in_channel, h, w, ci, co: - kernel[(out_channel * oc_bn + co + g * kernels_per_group), - in_channel * ic_bn + ci, h, w], + lambda g, CO, CI, h, w, ci, co: + kernel[(CO * oc_bn + co + g * KPG), + CI * ic_bn + ci, h, w], name='kernel_vec') # convolution - oshape = (groups, batch_size, kernels_per_group//oc_bn, - out_height, out_width, oc_bn) - unpack_shape = (batch_size, out_channel, out_height, out_width) + oshape = (G, N, KPG//oc_bn, + OH, OW, oc_bn) + unpack_shape = (N, CO, OH, OW) - ic = te.reduce_axis((0, (kernel_depth)), name='ic') - kh = te.reduce_axis((0, k_height), name='kh') - kw = te.reduce_axis((0, k_width), name='kw') + ic = te.reduce_axis((0, (CPG)), name='ic') + kh = te.reduce_axis((0, KH), name='kh') + kw = te.reduce_axis((0, KW), name='kw') idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(data_vec[g, n, idxdiv(ic, ic_bn), - oh * stride_h + kh * dilation_h, + oh*HSTR+kh*dilation_h, idxmod(ic, ic_bn), - ow * stride_w + kw * dilation_w].astype(out_dtype) * + ow*WSTR+kw*dilation_w].astype(out_dtype) * kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(out_dtype), @@ -210,10 +174,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, unpack = te.compute(unpack_shape, lambda n, c, h, w: - conv[idxdiv(c, kernels_per_group), n, - idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + conv[idxdiv(c, KPG), n, + idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)), h, w, - idxmod(idxmod(c, oc_bn), kernels_per_group)] + idxmod(idxmod(c, oc_bn), KPG)] .astype(out_dtype), name='output_unpack', tag='group_conv2d_nchw') @@ -253,9 +217,12 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] + _, c, h, w = get_const_tuple(data.shape) + _, x, kh, kw = get_const_tuple(kernel.shape) + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - _schedule_gspc_nchw(*args) + schedule_conv_sp_grouped(*args) scheduled_ops.append(op) @@ -263,24 +230,25 @@ def traverse(op): return s -def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last): - """Schedule GSPC""" +def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec, + conv_out, output, last, + **kwargs): + # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) # no stride and padding info here padding = infer_pad(data, data_pad) - hpad, wpad = padding - DOPAD = (hpad != 0 or wpad != 0) + HPAD, WPAD = padding + DOPAD = (HPAD != 0 or WPAD != 0) - _, W = data, kernel_vec + A, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data if DOPAD: s[A0].compute_inline() - groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) s[A1].parallel(parallel_axis) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 158795eb4e17..310257099e8c 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -1,36 +1,17 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member -# pylint: disable=no-value-for-parameter,import-outside-toplevel -"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" - import tvm from tvm import autotvm from tvm import te -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity - -from .utils import get_fp32_len -from ..utils import get_const_tuple +from .util import get_fp32_len +from ..util import get_const_tuple from ..nn.pad import pad from .. import tag -from ..nn.utils import infer_pad +from ..nn.conv2d import group_conv2d_nchw +from ..nn.util import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): @@ -44,8 +25,8 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, - out_dtype, layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, + layout='NCHW'): """ Get default schedule config for the workload """ @@ -64,28 +45,24 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - hpad = wkl.hpad - stride_w = wkl.wstride - out_width = (wkl.width + 2*hpad - wkl.wkernel) // stride_w + 1 - groups = wkl.groups - kernels_per_group = wkl.out_filter // groups - kernel_depth = wkl.in_filter // groups - + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + G = wkl.groups + KPG = wkl.out_filter // G + CPG = wkl.in_filter // G oc_bn = 1 + for bn in range(simd_width, 0, -1): - if kernels_per_group % bn == 0: + if KPG % bn == 0: oc_bn = bn break - if oc_bn > kernels_per_group: - oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if kernel_depth % bn == 0: + if CPG % bn == 0: ic_bn = bn break - if ic_bn > kernel_depth: - ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -102,10 +79,6 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.x86") def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'): - """ - Compute group conv2d with NCHW layout, using GSPC algorithm. - https://arxiv.org/abs/2006.09791 - """ assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(dilation, int): dilation_h, dilation_w = dilation, dilation @@ -114,94 +87,87 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + HPAD, WPAD = padding, padding elif len(padding) == 2: - hpad, wpad = padding - pad_top, pad_bottom = hpad, hpad - pad_left, pad_right = wpad, wpad + HPAD, WPAD = padding else: - pad_top, pad_left, pad_bottom, pad_right = padding - - hpad = pad_top + pad_bottom - wpad = pad_left + pad_right + HPAD, _, WPAD, _ = padding assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - stride_h, stride_w = strides, strides + HSTR, WSTR = strides, strides else: - stride_h, stride_w = strides + HSTR, WSTR = strides - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + N, CI, IH, IW = get_const_tuple(data.shape) + CO, CIG, KH, KW = get_const_tuple(kernel.shape) - pad_height = in_height + pad_top + pad_bottom - pad_width = in_width + pad_left + pad_right + pad_height = IH + 2 * HPAD + pad_width = IW + 2 * WPAD - dilated_kernel_h = (k_height - 1) * dilation_h + 1 - dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + dilated_kernel_h = (KH - 1) * dilation_h + 1 + dilated_kernel_w = (KW - 1) * dilation_w + 1 + OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 + OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 - kernels_per_group = out_channel // groups + G = groups + KPG = CO // G + CPG = CI // G - cfg.define_split("tile_ic", in_channel, num_outputs=2) - cfg.define_split("tile_oc", out_channel, num_outputs=2) - cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", CI, num_outputs=2) + cfg.define_split("tile_oc", CO, num_outputs=2) + cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), - dtype=data.dtype), - te.placeholder((out_channel, in_channel // groups, k_height, k_width), + _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype), + te.placeholder((N, CI // G, KH, KW), dtype=kernel.dtype), strides, padding, groups, out_dtype) oc_bn = cfg['tile_oc'].size[-1] ic_bn = cfg['tile_ic'].size[-1] # pack data - DOPAD = (hpad != 0 or wpad != 0) + DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: - data_pad = pad(data, - (0, 0, pad_top, pad_left), - (0, 0, pad_bottom, pad_right), - name="data_pad") + data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data - shape = (groups, batch_size, kernel_depth // ic_bn, + shape = (G, N, CPG // ic_bn, pad_height, ic_bn, pad_width) data_vec = te.compute(shape, lambda g, n, C, h, c, w: - data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + data_pad[n, C * ic_bn + c + CPG * g, h, w], name='data_vec') # pack kernel - shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, - k_height, k_width, ic_bn, oc_bn) + shape = (G, KPG//oc_bn, CPG//ic_bn, + KH, KW, ic_bn, oc_bn) kernel_vec = te.compute(shape, - lambda g, out_channel, in_channel, h, w, ci, co: - kernel[(out_channel * oc_bn + co + g * kernels_per_group), - in_channel * ic_bn + ci, h, w], + lambda g, CO, CI, h, w, ci, co: + kernel[(CO * oc_bn + co + g * KPG), + CI * ic_bn + ci, h, w], name='kernel_vec') # convolution - oshape = (groups, batch_size, kernels_per_group//oc_bn, - out_height, out_width, oc_bn) - unpack_shape = (batch_size, out_channel, out_height, out_width) + oshape = (G, N, KPG//oc_bn, + OH, OW, oc_bn) + unpack_shape = (N, CO, OH, OW) - ic = te.reduce_axis((0, (kernel_depth)), name='ic') - kh = te.reduce_axis((0, k_height), name='kh') - kw = te.reduce_axis((0, k_width), name='kw') + ic = te.reduce_axis((0, (CPG)), name='ic') + kh = te.reduce_axis((0, KH), name='kh') + kw = te.reduce_axis((0, KW), name='kw') idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(data_vec[g, n, idxdiv(ic, ic_bn), - oh * stride_h + kh * dilation_h, + oh*HSTR+kh*dilation_h, idxmod(ic, ic_bn), - ow * stride_w + kw * dilation_w].astype(out_dtype) * + ow*WSTR+kw*dilation_w].astype(out_dtype) * kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(out_dtype), @@ -209,10 +175,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, unpack = te.compute(unpack_shape, lambda n, c, h, w: - conv[idxdiv(c, kernels_per_group), n, - idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + conv[idxdiv(c, KPG), n, + idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)), h, w, - idxmod(idxmod(c, oc_bn), kernels_per_group)] + idxmod(idxmod(c, oc_bn), KPG)] .astype(out_dtype), name='output_unpack', tag='group_conv2d_nchw') @@ -252,9 +218,12 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] + _, c, h, w = get_const_tuple(data.shape) + _, x, kh, kw = get_const_tuple(kernel.shape) + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - _schedule_gspc_nchw(*args) + schedule_conv_sp_grouped(*args) scheduled_ops.append(op) @@ -262,25 +231,25 @@ def traverse(op): return s -def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last): - """Schedule GSPC""" +def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec, + conv_out, output, last, + **kwargs): + # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) # no stride and padding info here padding = infer_pad(data, data_pad) - hpad, wpad = padding - DOPAD = (hpad != 0 or wpad != 0) + HPAD, WPAD = padding + DOPAD = (HPAD != 0 or WPAD != 0) - _, W = data, kernel_vec + A, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data if DOPAD: s[A0].compute_inline() - - groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) s[A1].parallel(parallel_axis) @@ -322,7 +291,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) - s[CC].parallel(parallel_axis) s[CC].vectorize(oc_block) From ca5c67f02aa90f276317e8a80187547ac11a1b1b Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 27 Aug 2020 17:01:36 +0100 Subject: [PATCH 03/28] Added ASF license header --- python/tvm/topi/arm_cpu/group_conv2d.py | 17 +++++++++++++++++ python/tvm/topi/x86/group_conv2d.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index c8ae9bc46fbe..d2f93b381f77 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import tvm from tvm import autotvm from tvm import te diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 310257099e8c..2afcc1b1e200 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import tvm from tvm import autotvm from tvm import te From 737d36f05cf0947d09188065b474edbef3a5dd79 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 27 Aug 2020 17:44:40 +0100 Subject: [PATCH 04/28] Minor bug fixes --- python/tvm/topi/x86/group_conv2d.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 2afcc1b1e200..53d1c13a3e39 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -18,6 +18,8 @@ import tvm from tvm import autotvm from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + from .util import get_fp32_len from ..util import get_const_tuple from ..nn.pad import pad @@ -27,8 +29,6 @@ from ..nn.util import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity - def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): @@ -68,18 +68,22 @@ def _fallback_schedule(cfg, wkl): G = wkl.groups KPG = wkl.out_filter // G CPG = wkl.in_filter // G - oc_bn = 1 + oc_bn = 1 for bn in range(simd_width, 0, -1): if KPG % bn == 0: oc_bn = bn break + if oc_bn > KPG: + oc_bn = KPG ic_bn = 1 for bn in range(oc_bn, 0, -1): if CPG % bn == 0: ic_bn = bn break + if ic_bn > CPG: + ic_bn = CPG reg_n = 1 for n in range(31, 0, -1): @@ -139,7 +143,7 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # If no config was set, we can fallback to default config. if cfg.is_fallback: _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype), - te.placeholder((N, CI // G, KH, KW), + te.placeholder((CO, CI // G, KH, KW), dtype=kernel.dtype), strides, padding, groups, out_dtype) From e22adbe921c71b8632490d4e4aba5aabdb390663 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 28 Aug 2020 13:19:23 +0100 Subject: [PATCH 05/28] Added asymmetric padding support Fixed linting --- python/tvm/topi/arm_cpu/group_conv2d.py | 109 +++++++++++---------- python/tvm/topi/x86/group_conv2d.py | 121 ++++++++++++------------ 2 files changed, 122 insertions(+), 108 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index d2f93b381f77..6066c7060ca4 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" import tvm from tvm import autotvm @@ -55,30 +56,34 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, data = te.placeholder(static_data_shape, dtype=data.dtype) wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, - layout) + layout, asymmetric_pad=True) _fallback_schedule(cfg, wkl) def _fallback_schedule(cfg, wkl): simd_width = 4 # assume ARM SIMD Width is 4 - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 - G = wkl.groups - KPG = wkl.out_filter // G - CPG = wkl.in_filter // G - oc_bn = 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + stride_w = wkl.wstride + out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups + oc_bn = 1 for bn in range(simd_width, 0, -1): - if KPG % bn == 0: + if kernels_per_group % bn == 0: oc_bn = bn break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if CPG % bn == 0: + if kernel_depth % bn == 0: ic_bn = bn break + if ic_bn > kernel_depth: + ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -103,42 +108,45 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - HPAD, WPAD = padding, padding + pt, pl, pb, pr = padding, padding, padding, padding elif len(padding) == 2: HPAD, WPAD = padding + pt, pb = HPAD, HPAD + pl, pr = WPAD, WPAD else: - HPAD, _, WPAD, _ = padding + pt, pl, pb, pr = padding + + HPAD = pt + pb + WPAD = pl + pr assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - HSTR, WSTR = strides, strides + stride_h, stride_w = strides, strides else: - HSTR, WSTR = strides + stride_h, stride_w = strides - N, CI, IH, IW = get_const_tuple(data.shape) - CO, CIG, KH, KW = get_const_tuple(kernel.shape) + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = IH + 2 * HPAD - pad_width = IW + 2 * WPAD + pad_height = in_height + pt + pb + pad_width = in_width + pl + pr - dilated_kernel_h = (KH - 1) * dilation_h + 1 - dilated_kernel_w = (KW - 1) * dilation_w + 1 - OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 - OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 - G = groups - KPG = CO // G - CPG = CI // G + kernels_per_group = out_channel // groups - cfg.define_split("tile_ic", CI, num_outputs=2) - cfg.define_split("tile_oc", CO, num_outputs=2) - cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype), - te.placeholder((N, CI // G, KH, KW), + _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder((out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype), strides, padding, groups, out_dtype) @@ -147,43 +155,46 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # pack data DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: - data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + data_pad = pad(data, + (0, 0, pt, pl), + (0, 0, pb, pr), + name="data_pad") else: data_pad = data - shape = (G, N, CPG // ic_bn, + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) data_vec = te.compute(shape, lambda g, n, C, h, c, w: - data_pad[n, C * ic_bn + c + CPG * g, h, w], + data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], name='data_vec') # pack kernel - shape = (G, KPG//oc_bn, CPG//ic_bn, - KH, KW, ic_bn, oc_bn) + shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, + k_height, k_width, ic_bn, oc_bn) kernel_vec = te.compute(shape, - lambda g, CO, CI, h, w, ci, co: - kernel[(CO * oc_bn + co + g * KPG), - CI * ic_bn + ci, h, w], + lambda g, out_channel, in_channel, h, w, ci, co: + kernel[(out_channel * oc_bn + co + g * kernels_per_group), + in_channel * ic_bn + ci, h, w], name='kernel_vec') # convolution - oshape = (G, N, KPG//oc_bn, - OH, OW, oc_bn) - unpack_shape = (N, CO, OH, OW) + oshape = (groups, batch_size, kernels_per_group//oc_bn, + out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) - ic = te.reduce_axis((0, (CPG)), name='ic') - kh = te.reduce_axis((0, KH), name='kh') - kw = te.reduce_axis((0, KW), name='kw') + ic = te.reduce_axis((0, (kernel_depth)), name='ic') + kh = te.reduce_axis((0, k_height), name='kh') + kw = te.reduce_axis((0, k_width), name='kw') idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(data_vec[g, n, idxdiv(ic, ic_bn), - oh*HSTR+kh*dilation_h, + oh * stride_h + kh * dilation_h, idxmod(ic, ic_bn), - ow*WSTR+kw*dilation_w].astype(out_dtype) * + ow * stride_w + kw * dilation_w].astype(out_dtype) * kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(out_dtype), @@ -191,10 +202,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, unpack = te.compute(unpack_shape, lambda n, c, h, w: - conv[idxdiv(c, KPG), n, - idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)), + conv[idxdiv(c, kernels_per_group), n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), h, w, - idxmod(idxmod(c, oc_bn), KPG)] + idxmod(idxmod(c, oc_bn), kernels_per_group)] .astype(out_dtype), name='output_unpack', tag='group_conv2d_nchw') diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 53d1c13a3e39..e56d2db912f8 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" import tvm from tvm import autotvm @@ -56,34 +57,34 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, data = te.placeholder(static_data_shape, dtype=data.dtype) wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, - layout) + layout, asymmetric_pad=True) _fallback_schedule(cfg, wkl) def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 - G = wkl.groups - KPG = wkl.out_filter // G - CPG = wkl.in_filter // G + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + stride_w = wkl.wstride + out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups oc_bn = 1 for bn in range(simd_width, 0, -1): - if KPG % bn == 0: + if kernels_per_group % bn == 0: oc_bn = bn break - if oc_bn > KPG: - oc_bn = KPG + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if CPG % bn == 0: + if kernel_depth % bn == 0: ic_bn = bn break - if ic_bn > CPG: - ic_bn = CPG + if ic_bn > kernel_depth: + ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -108,42 +109,45 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - HPAD, WPAD = padding, padding + pt, pl, pb, pr = padding, padding, padding, padding elif len(padding) == 2: HPAD, WPAD = padding + pt, pb = HPAD, HPAD + pl, pr = WPAD, WPAD else: - HPAD, _, WPAD, _ = padding + pt, pl, pb, pr = padding + + HPAD = pt + pb + WPAD = pl + pr assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - HSTR, WSTR = strides, strides + stride_h, stride_w = strides, strides else: - HSTR, WSTR = strides + stride_h, stride_w = strides - N, CI, IH, IW = get_const_tuple(data.shape) - CO, CIG, KH, KW = get_const_tuple(kernel.shape) + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = IH + 2 * HPAD - pad_width = IW + 2 * WPAD + pad_height = in_height + pt + pb + pad_width = in_width + pl + pr - dilated_kernel_h = (KH - 1) * dilation_h + 1 - dilated_kernel_w = (KW - 1) * dilation_w + 1 - OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 - OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 - G = groups - KPG = CO // G - CPG = CI // G + kernels_per_group = out_channel // groups - cfg.define_split("tile_ic", CI, num_outputs=2) - cfg.define_split("tile_oc", CO, num_outputs=2) - cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype), - te.placeholder((CO, CI // G, KH, KW), + _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder((out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype), strides, padding, groups, out_dtype) @@ -152,43 +156,46 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # pack data DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: - data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") + data_pad = pad(data, + (0, 0, pt, pl), + (0, 0, pb, pr), + name="data_pad") else: data_pad = data - shape = (G, N, CPG // ic_bn, + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) data_vec = te.compute(shape, lambda g, n, C, h, c, w: - data_pad[n, C * ic_bn + c + CPG * g, h, w], + data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], name='data_vec') # pack kernel - shape = (G, KPG//oc_bn, CPG//ic_bn, - KH, KW, ic_bn, oc_bn) + shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, + k_height, k_width, ic_bn, oc_bn) kernel_vec = te.compute(shape, - lambda g, CO, CI, h, w, ci, co: - kernel[(CO * oc_bn + co + g * KPG), - CI * ic_bn + ci, h, w], + lambda g, out_channel, in_channel, h, w, ci, co: + kernel[(out_channel * oc_bn + co + g * kernels_per_group), + in_channel * ic_bn + ci, h, w], name='kernel_vec') # convolution - oshape = (G, N, KPG//oc_bn, - OH, OW, oc_bn) - unpack_shape = (N, CO, OH, OW) + oshape = (groups, batch_size, kernels_per_group//oc_bn, + out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) - ic = te.reduce_axis((0, (CPG)), name='ic') - kh = te.reduce_axis((0, KH), name='kh') - kw = te.reduce_axis((0, KW), name='kw') + ic = te.reduce_axis((0, (kernel_depth)), name='ic') + kh = te.reduce_axis((0, k_height), name='kh') + kw = te.reduce_axis((0, k_width), name='kw') idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(data_vec[g, n, idxdiv(ic, ic_bn), - oh*HSTR+kh*dilation_h, + oh * stride_h + kh * dilation_h, idxmod(ic, ic_bn), - ow*WSTR+kw*dilation_w].astype(out_dtype) * + ow * stride_w + kw * dilation_w].astype(out_dtype) * kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(out_dtype), @@ -196,10 +203,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, unpack = te.compute(unpack_shape, lambda n, c, h, w: - conv[idxdiv(c, KPG), n, - idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)), + conv[idxdiv(c, kernels_per_group), n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), h, w, - idxmod(idxmod(c, oc_bn), KPG)] + idxmod(idxmod(c, oc_bn), kernels_per_group)] .astype(out_dtype), name='output_unpack', tag='group_conv2d_nchw') @@ -239,9 +246,6 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - _, c, h, w = get_const_tuple(data.shape) - _, x, kh, kw = get_const_tuple(kernel.shape) - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] schedule_conv_sp_grouped(*args) @@ -253,8 +257,7 @@ def traverse(op): def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last, - **kwargs): + conv_out, output, last): # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) @@ -264,13 +267,13 @@ def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec, HPAD, WPAD = padding DOPAD = (HPAD != 0 or WPAD != 0) - A, W = data, kernel_vec + _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data if DOPAD: s[A0].compute_inline() - groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) s[A1].parallel(parallel_axis) From d7f6c2b06a6fe9523654bb02a21f550a4b237dc7 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 28 Aug 2020 13:44:47 +0100 Subject: [PATCH 06/28] Improve linting --- python/tvm/topi/arm_cpu/group_conv2d.py | 64 +++++++++++++------------ python/tvm/topi/x86/group_conv2d.py | 52 ++++++++++---------- 2 files changed, 61 insertions(+), 55 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 6066c7060ca4..4f7c29005804 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -19,15 +19,16 @@ import tvm from tvm import autotvm from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + from ..util import get_const_tuple from ..nn.pad import pad from .. import tag -from ..nn.conv2d import group_conv2d_nchw from ..nn.util import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, @@ -62,9 +63,9 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, def _fallback_schedule(cfg, wkl): simd_width = 4 # assume ARM SIMD Width is 4 - pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr stride_w = wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1 + out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 groups = wkl.groups kernels_per_group = wkl.out_filter // groups kernel_depth = wkl.in_filter // groups @@ -100,6 +101,10 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm. + https://arxiv.org/abs/2006.09791 + """ assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(dilation, int): dilation_h, dilation_w = dilation, dilation @@ -108,16 +113,16 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pt, pl, pb, pr = padding, padding, padding, padding + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding elif len(padding) == 2: - HPAD, WPAD = padding - pt, pb = HPAD, HPAD - pl, pr = WPAD, WPAD + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad else: - pt, pl, pb, pr = padding + pad_top, pad_left, pad_bottom, pad_right = padding - HPAD = pt + pb - WPAD = pl + pr + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): @@ -128,13 +133,13 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = in_height + pt + pb - pad_width = in_width + pl + pr + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right dilated_kernel_h = (k_height - 1) * dilation_h + 1 dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 kernels_per_group = out_channel // groups @@ -153,11 +158,11 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, oc_bn = cfg['tile_oc'].size[-1] ic_bn = cfg['tile_ic'].size[-1] # pack data - DOPAD = (HPAD != 0 or WPAD != 0) - if DOPAD: + do_pad = (hpad != 0 or wpad != 0) + if do_pad: data_pad = pad(data, - (0, 0, pt, pl), - (0, 0, pb, pr), + (0, 0, pad_top, pad_left), + (0, 0, pad_bottom, pad_right), name="data_pad") else: data_pad = data @@ -245,12 +250,9 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - _, c, h, w = get_const_tuple(data.shape) - _, x, kh, kw = get_const_tuple(kernel.shape) - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - schedule_conv_sp_grouped(*args) + _schedule_gspc_nchw(*args) scheduled_ops.append(op) @@ -258,25 +260,25 @@ def traverse(op): return s -def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last, - **kwargs): +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, + conv_out, output, last): + """Schedule GSPC""" # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) # no stride and padding info here padding = infer_pad(data, data_pad) - HPAD, WPAD = padding - DOPAD = (HPAD != 0 or WPAD != 0) + hpad, wpad = padding + do_pad = (hpad != 0 or wpad != 0) - A, W = data, kernel_vec + _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data - if DOPAD: + if do_pad: s[A0].compute_inline() - groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) s[A1].parallel(parallel_axis) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index e56d2db912f8..0f8426296cc9 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -26,7 +26,6 @@ from ..nn.pad import pad from .. import tag -from ..nn.conv2d import group_conv2d_nchw from ..nn.util import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -63,9 +62,9 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr stride_w = wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1 + out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 groups = wkl.groups kernels_per_group = wkl.out_filter // groups kernel_depth = wkl.in_filter // groups @@ -101,6 +100,10 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.x86") def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm. + https://arxiv.org/abs/2006.09791 + """ assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(dilation, int): dilation_h, dilation_w = dilation, dilation @@ -109,16 +112,16 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pt, pl, pb, pr = padding, padding, padding, padding + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding elif len(padding) == 2: - HPAD, WPAD = padding - pt, pb = HPAD, HPAD - pl, pr = WPAD, WPAD + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad else: - pt, pl, pb, pr = padding + pad_top, pad_left, pad_bottom, pad_right = padding - HPAD = pt + pb - WPAD = pl + pr + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): @@ -129,13 +132,13 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = in_height + pt + pb - pad_width = in_width + pl + pr + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right dilated_kernel_h = (k_height - 1) * dilation_h + 1 dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 kernels_per_group = out_channel // groups @@ -154,11 +157,11 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, oc_bn = cfg['tile_oc'].size[-1] ic_bn = cfg['tile_ic'].size[-1] # pack data - DOPAD = (HPAD != 0 or WPAD != 0) - if DOPAD: + do_pad = (hpad != 0 or wpad != 0) + if do_pad: data_pad = pad(data, - (0, 0, pt, pl), - (0, 0, pb, pr), + (0, 0, pad_top, pad_left), + (0, 0, pad_bottom, pad_right), name="data_pad") else: data_pad = data @@ -248,7 +251,7 @@ def traverse(op): args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - schedule_conv_sp_grouped(*args) + _schedule_gspc_nchw(*args) scheduled_ops.append(op) @@ -256,22 +259,23 @@ def traverse(op): return s -def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last): +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, + conv_out, output, last): + """Schedule GSPC""" # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) # no stride and padding info here padding = infer_pad(data, data_pad) - HPAD, WPAD = padding - DOPAD = (HPAD != 0 or WPAD != 0) + hpad, wpad = padding + do_pad = (hpad != 0 or wpad != 0) _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data - if DOPAD: + if do_pad: s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis From 323f8e5e80f63322a02f5d760a046a8eefe6b092 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 31 Aug 2020 09:51:13 +0100 Subject: [PATCH 07/28] Better linting, disable final linting checks --- python/tvm/topi/arm_cpu/group_conv2d.py | 16 ++++++++------- python/tvm/topi/x86/group_conv2d.py | 26 ++++++++++++++++--------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 4f7c29005804..db56ae0cdab9 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" import tvm @@ -43,8 +45,8 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, - layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, + out_dtype, layout='NCHW'): """ Get default schedule config for the workload """ @@ -63,7 +65,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, def _fallback_schedule(cfg, wkl): simd_width = 4 # assume ARM SIMD Width is 4 - pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr + pad_left, pad_right = wkl.padl, wkl.padr stride_w = wkl.wstride out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 groups = wkl.groups @@ -158,8 +160,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, oc_bn = cfg['tile_oc'].size[-1] ic_bn = cfg['tile_ic'].size[-1] # pack data - do_pad = (hpad != 0 or wpad != 0) - if do_pad: + DOPAD = (hpad != 0 or wpad != 0) + if DOPAD: data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), @@ -270,13 +272,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, # no stride and padding info here padding = infer_pad(data, data_pad) hpad, wpad = padding - do_pad = (hpad != 0 or wpad != 0) + DOPAD = (hpad != 0 or wpad != 0) _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data - if do_pad: + if DOPAD: s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 0f8426296cc9..377e480482fd 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" import tvm @@ -42,8 +44,8 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, - layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, + out_dtype, layout='NCHW'): """ Get default schedule config for the workload """ @@ -62,7 +64,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr + pad_left, pad_right = wkl.padl, wkl.padr stride_w = wkl.wstride out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 groups = wkl.groups @@ -149,7 +151,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), + dtype=data.dtype), te.placeholder((out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype), strides, padding, groups, out_dtype) @@ -157,8 +160,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, oc_bn = cfg['tile_oc'].size[-1] ic_bn = cfg['tile_ic'].size[-1] # pack data - do_pad = (hpad != 0 or wpad != 0) - if do_pad: + DOPAD = (hpad != 0 or wpad != 0) + if DOPAD: data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), @@ -260,7 +263,7 @@ def traverse(op): def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last): + conv_out, output, last): """Schedule GSPC""" # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], @@ -269,14 +272,16 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, # no stride and padding info here padding = infer_pad(data, data_pad) hpad, wpad = padding - do_pad = (hpad != 0 or wpad != 0) + DOPAD = (hpad != 0 or wpad != 0) _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data - if do_pad: + if DOPAD: s[A0].compute_inline() + # s[A0].compute_at(s[A1]) + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) @@ -319,8 +324,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + # s[A1].compute_at(CC, parallel_axis) s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) s[CC].unroll(ow_block) From 55ad3d72b01135b673b8f808540c551dea0bece9 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 3 Sep 2020 19:56:36 +0100 Subject: [PATCH 08/28] Fixed final linting errors (figured out how to run lint tests locally) --- python/tvm/topi/arm_cpu/group_conv2d.py | 4 ++-- python/tvm/topi/x86/group_conv2d.py | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index db56ae0cdab9..f62d330cab30 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -152,7 +152,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), + dtype=data.dtype), te.placeholder((out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype), strides, padding, groups, out_dtype) @@ -265,7 +266,6 @@ def traverse(op): def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): """Schedule GSPC""" - # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 377e480482fd..ffef5099b304 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -265,7 +265,6 @@ def traverse(op): def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): """Schedule GSPC""" - # fetch schedule ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) @@ -280,7 +279,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, # schedule data if DOPAD: s[A0].compute_inline() - # s[A0].compute_at(s[A1]) groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis @@ -324,10 +322,8 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) - # s[A1].compute_at(CC, parallel_axis) - s[CC].parallel(parallel_axis) - + s[CC].parallel(parallel_axis) s[CC].vectorize(oc_block) From 0722844a6b53d35aa7db13923116edd090566de5 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 21 Dec 2020 19:31:13 +0000 Subject: [PATCH 09/28] fixing linter formatting part 1 --- python/tvm/topi/arm_cpu/group_conv2d.py | 179 ++++++++++++++---------- python/tvm/topi/x86/group_conv2d.py | 172 +++++++++++++---------- 2 files changed, 205 insertions(+), 146 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index f62d330cab30..7de0dfa300b8 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -32,12 +32,11 @@ - -def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, - out_dtype): +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): """Compute group_conv2d with NCHW layout""" - return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding, - dilation, groups, out_dtype) + return group_conv2d_nchw_spatial_pack( + data, kernel, strides, padding, dilation, groups, out_dtype + ) def schedule_group_conv2d_nchw(outs): @@ -45,8 +44,7 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, - out_dtype, layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'): """ Get default schedule config for the workload """ @@ -58,13 +56,12 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, static_data_shape.append(dim) data = te.placeholder(static_data_shape, dtype=data.dtype) - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, - layout, asymmetric_pad=True) + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) _fallback_schedule(cfg, wkl) def _fallback_schedule(cfg, wkl): - simd_width = 4 # assume ARM SIMD Width is 4 + simd_width = 4 # assume ARM SIMD Width is 4 pad_left, pad_right = wkl.padl, wkl.padr stride_w = wkl.wstride out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 @@ -101,8 +98,9 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") -def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, - dilation, groups, out_dtype='float32'): +def group_conv2d_nchw_spatial_pack( + cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32' +): """ Compute group conv2d with NCHW layout, using GSPC algorithm. https://arxiv.org/abs/2006.09791 @@ -152,71 +150,101 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), - dtype=data.dtype), - te.placeholder((out_channel, in_channel // groups, k_height, k_width), - dtype=kernel.dtype), - strides, padding, groups, out_dtype) - - oc_bn = cfg['tile_oc'].size[-1] - ic_bn = cfg['tile_ic'].size[-1] + _get_default_config( + cfg, + te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder( + (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype + ), + strides, + padding, + groups, + out_dtype, + ) + + oc_bn = cfg["tile_oc"].size[-1] + ic_bn = cfg["tile_ic"].size[-1] + # pack data - DOPAD = (hpad != 0 or wpad != 0) + DOPAD = hpad != 0 or wpad != 0 if DOPAD: - data_pad = pad(data, - (0, 0, pad_top, pad_left), - (0, 0, pad_bottom, pad_right), - name="data_pad") + data_pad = pad( + data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad" + ) else: data_pad = data - shape = (groups, batch_size, kernel_depth // ic_bn, - pad_height, ic_bn, pad_width) + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) - data_vec = te.compute(shape, - lambda g, n, C, h, c, w: - data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], - name='data_vec') + data_vec = te.compute( + shape, + lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name="data_vec", + ) # pack kernel - shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, - k_height, k_width, ic_bn, oc_bn) - kernel_vec = te.compute(shape, - lambda g, out_channel, in_channel, h, w, ci, co: - kernel[(out_channel * oc_bn + co + g * kernels_per_group), - in_channel * ic_bn + ci, h, w], - name='kernel_vec') + shape = ( + groups, + kernels_per_group // oc_bn, + kernel_depth // ic_bn, + k_height, + k_width, + ic_bn, + oc_bn, + ) + + kernel_vec = te.compute( + shape, + lambda g, out_channel, in_channel, h, w, ci, co: kernel[ + (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w + ], + name="kernel_vec", + ) # convolution - oshape = (groups, batch_size, kernels_per_group//oc_bn, - out_height, out_width, oc_bn) + oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, out_channel, out_height, out_width) - ic = te.reduce_axis((0, (kernel_depth)), name='ic') - kh = te.reduce_axis((0, k_height), name='kh') - kw = te.reduce_axis((0, k_width), name='kw') + ic = te.reduce_axis((0, (kernel_depth)), name="ic") + kh = te.reduce_axis((0, k_height), name="kh") + kw = te.reduce_axis((0, k_width), name="kw") + idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv - conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: - te.sum(data_vec[g, n, idxdiv(ic, ic_bn), - oh * stride_h + kh * dilation_h, - idxmod(ic, ic_bn), - ow * stride_w + kw * dilation_w].astype(out_dtype) * - kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), - kh, kw, idxmod(ic, ic_bn), - oc_block].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') - - unpack = te.compute(unpack_shape, - lambda n, c, h, w: - conv[idxdiv(c, kernels_per_group), n, - idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), - h, w, - idxmod(idxmod(c, oc_bn), kernels_per_group)] - .astype(out_dtype), - name='output_unpack', - tag='group_conv2d_nchw') + conv = te.compute( + oshape, + lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( + data_vec[ + g, + n, + idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w, + ].astype(out_dtype) + * kernel_vec[ + g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block + ].astype(out_dtype), + axis=[ic, kh, kw], + ), + name="conv", + ) + + unpack = te.compute( + unpack_shape, + lambda n, c, h, w: conv[ + idxdiv(c, kernels_per_group), + n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, + w, + idxmod(idxmod(c, oc_bn), kernels_per_group), + ].astype(out_dtype), + name="output_unpack", + tag="group_conv2d_nchw", + ) + return unpack @@ -236,7 +264,7 @@ def traverse(op): if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) - if 'group_conv2d_nchw' in op.tag: + if "group_conv2d_nchw" in op.tag: output = op.output(0) if "tile_ic" not in cfg: @@ -253,8 +281,7 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, - output, outs[0]] + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] _schedule_gspc_nchw(*args) scheduled_ops.append(op) @@ -263,16 +290,20 @@ def traverse(op): return s -def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last): +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): """Schedule GSPC""" - ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) + ic_bn, oc_bn, reg_n, unroll_kw = ( + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val, + ) + # no stride and padding info here padding = infer_pad(data, data_pad) hpad, wpad = padding - DOPAD = (hpad != 0 or wpad != 0) + DOPAD = hpad != 0 or wpad != 0 _, W = data, kernel_vec A0, A1 = data_pad, data_vec @@ -297,7 +328,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, # schedule conv C, O0, O = conv_out, output, last - CC = s.cache_write(C, 'global') + CC = s.cache_write(C, "global") _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis @@ -314,12 +345,10 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) if unroll_kw: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, - ow_block, oc_block) + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) s[CC].unroll(kw) else: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, - ow_block, oc_block) + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) s[CC].parallel(parallel_axis) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index ffef5099b304..c6b808ac9534 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -32,11 +32,11 @@ from ..nn.conv2d import _get_workload as _get_conv2d_workload -def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, - out_dtype): +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): """Compute group_conv2d with NCHW layout""" - return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding, - dilation, groups, out_dtype) + return group_conv2d_nchw_spatial_pack( + data, kernel, strides, padding, dilation, groups, out_dtype + ) def schedule_group_conv2d_nchw(outs): @@ -44,8 +44,7 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, - out_dtype, layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'): """ Get default schedule config for the workload """ @@ -57,8 +56,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, static_data_shape.append(dim) data = te.placeholder(static_data_shape, dtype=data.dtype) - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, - layout, asymmetric_pad=True) + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) _fallback_schedule(cfg, wkl) @@ -100,8 +98,9 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.x86") -def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, - dilation, groups, out_dtype='float32'): +def group_conv2d_nchw_spatial_pack( + cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32' +): """ Compute group conv2d with NCHW layout, using GSPC algorithm. https://arxiv.org/abs/2006.09791 @@ -151,71 +150,100 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), - dtype=data.dtype), - te.placeholder((out_channel, in_channel // groups, k_height, k_width), - dtype=kernel.dtype), - strides, padding, groups, out_dtype) - - oc_bn = cfg['tile_oc'].size[-1] - ic_bn = cfg['tile_ic'].size[-1] + _get_default_config( + cfg, + te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder( + (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype + ), + strides, + padding, + groups, + out_dtype, + ) + + oc_bn = cfg["tile_oc"].size[-1] + ic_bn = cfg["tile_ic"].size[-1] + # pack data - DOPAD = (hpad != 0 or wpad != 0) + DOPAD = hpad != 0 or wpad != 0 if DOPAD: - data_pad = pad(data, - (0, 0, pad_top, pad_left), - (0, 0, pad_bottom, pad_right), - name="data_pad") + data_pad = pad( + data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad" + ) else: data_pad = data - shape = (groups, batch_size, kernel_depth // ic_bn, - pad_height, ic_bn, pad_width) + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) - data_vec = te.compute(shape, - lambda g, n, C, h, c, w: - data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], - name='data_vec') + data_vec = te.compute( + shape, + lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name="data_vec", + ) # pack kernel - shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn, - k_height, k_width, ic_bn, oc_bn) - kernel_vec = te.compute(shape, - lambda g, out_channel, in_channel, h, w, ci, co: - kernel[(out_channel * oc_bn + co + g * kernels_per_group), - in_channel * ic_bn + ci, h, w], - name='kernel_vec') + shape = ( + groups, + kernels_per_group // oc_bn, + kernel_depth // ic_bn, + k_height, + k_width, + ic_bn, + oc_bn, + ) + + kernel_vec = te.compute( + shape, + lambda g, out_channel, in_channel, h, w, ci, co: kernel[ + (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w + ], + name="kernel_vec", + ) # convolution - oshape = (groups, batch_size, kernels_per_group//oc_bn, - out_height, out_width, oc_bn) + oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, out_channel, out_height, out_width) - ic = te.reduce_axis((0, (kernel_depth)), name='ic') - kh = te.reduce_axis((0, k_height), name='kh') - kw = te.reduce_axis((0, k_width), name='kw') + ic = te.reduce_axis((0, (kernel_depth)), name="ic") + kh = te.reduce_axis((0, k_height), name="kh") + kw = te.reduce_axis((0, k_width), name="kw") + idxmod = tvm.tir.indexmod idxdiv = tvm.tir.indexdiv + conv = te.compute( + oshape, + lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( + data_vec[ + g, + n, + idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w, + ].astype(out_dtype) + * kernel_vec[ + g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block + ].astype(out_dtype), + axis=[ic, kh, kw], + ), + name="conv", + ) + + unpack = te.compute( + unpack_shape, + lambda n, c, h, w: conv[ + idxdiv(c, kernels_per_group), + n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, + w, + idxmod(idxmod(c, oc_bn), kernels_per_group), + ].astype(out_dtype), + name="output_unpack", + tag="group_conv2d_nchw", + ) - conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block: - te.sum(data_vec[g, n, idxdiv(ic, ic_bn), - oh * stride_h + kh * dilation_h, - idxmod(ic, ic_bn), - ow * stride_w + kw * dilation_w].astype(out_dtype) * - kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn), - kh, kw, idxmod(ic, ic_bn), - oc_block].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') - - unpack = te.compute(unpack_shape, - lambda n, c, h, w: - conv[idxdiv(c, kernels_per_group), n, - idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), - h, w, - idxmod(idxmod(c, oc_bn), kernels_per_group)] - .astype(out_dtype), - name='output_unpack', - tag='group_conv2d_nchw') return unpack @@ -235,7 +263,7 @@ def traverse(op): if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) - if 'group_conv2d_nchw' in op.tag: + if "group_conv2d_nchw" in op.tag: output = op.output(0) if "tile_ic" not in cfg: @@ -252,8 +280,7 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, - output, outs[0]] + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] _schedule_gspc_nchw(*args) scheduled_ops.append(op) @@ -265,13 +292,18 @@ def traverse(op): def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): """Schedule GSPC""" - ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) + ic_bn, oc_bn, reg_n, unroll_kw = ( + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val, + ) + # no stride and padding info here padding = infer_pad(data, data_pad) hpad, wpad = padding - DOPAD = (hpad != 0 or wpad != 0) + DOPAD = hpad != 0 or wpad != 0 _, W = data, kernel_vec A0, A1 = data_pad, data_vec @@ -297,7 +329,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, # schedule conv C, O0, O = conv_out, output, last - CC = s.cache_write(C, 'global') + CC = s.cache_write(C, "global") _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis @@ -314,12 +346,10 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) if unroll_kw: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, - ow_block, oc_block) + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) s[CC].unroll(kw) else: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, - ow_block, oc_block) + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) From c9460efe4a55d7d60555d5f613ce1a5d0797f32b Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 21 Dec 2020 19:40:24 +0000 Subject: [PATCH 10/28] fixing linter formatting part 2 --- python/tvm/relay/op/strategy/arm_cpu.py | 2 +- python/tvm/topi/x86/group_conv2d.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 36200bb024a9..005eae68b8b7 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -210,7 +210,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.arm_cpu" + name="group_conv2d_nchw.arm_cpu", ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index c6b808ac9534..790f15a2949d 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -44,7 +44,7 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"): """ Get default schedule config for the workload """ @@ -99,7 +99,7 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.x86") def group_conv2d_nchw_spatial_pack( - cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32' + cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" ): """ Compute group conv2d with NCHW layout, using GSPC algorithm. @@ -202,7 +202,7 @@ def group_conv2d_nchw_spatial_pack( ) # convolution - oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn) + oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, out_channel, out_height, out_width) ic = te.reduce_axis((0, (kernel_depth)), name="ic") @@ -299,7 +299,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, cfg["unroll_kw"].val, ) - # no stride and padding info here padding = infer_pad(data, data_pad) hpad, wpad = padding From b41a8687bc2e9c992afd52e4539d9c5635b7a169 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 21 Dec 2020 19:45:41 +0000 Subject: [PATCH 11/28] fixing linter formatting part 3 --- python/tvm/topi/arm_cpu/group_conv2d.py | 8 +++----- python/tvm/topi/x86/group_conv2d.py | 5 ++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 7de0dfa300b8..a8fb4c6920a9 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -31,7 +31,6 @@ from ..nn.conv2d import _get_workload as _get_conv2d_workload - def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): """Compute group_conv2d with NCHW layout""" return group_conv2d_nchw_spatial_pack( @@ -44,7 +43,7 @@ def schedule_group_conv2d_nchw(outs): return schedule_group_conv2d_nchwc(outs) -def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'): +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"): """ Get default schedule config for the workload """ @@ -99,7 +98,7 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") def group_conv2d_nchw_spatial_pack( - cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32' + cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" ): """ Compute group conv2d with NCHW layout, using GSPC algorithm. @@ -202,7 +201,7 @@ def group_conv2d_nchw_spatial_pack( ) # convolution - oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn) + oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, out_channel, out_height, out_width) ic = te.reduce_axis((0, (kernel_depth)), name="ic") @@ -299,7 +298,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - # no stride and padding info here padding = infer_pad(data, data_pad) hpad, wpad = padding diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 790f15a2949d..8d946ba0dd98 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -99,7 +99,7 @@ def _fallback_schedule(cfg, wkl): @autotvm.register_topi_compute("group_conv2d_nchw.x86") def group_conv2d_nchw_spatial_pack( - cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" + cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" ): """ Compute group conv2d with NCHW layout, using GSPC algorithm. @@ -289,8 +289,7 @@ def traverse(op): return s -def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, - conv_out, output, last): +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): """Schedule GSPC""" ic_bn, oc_bn, reg_n, unroll_kw = ( cfg["tile_ic"].size[-1], From 1adf616183cc2a3c333f99698b9d02b0aeaaf84e Mon Sep 17 00:00:00 2001 From: Wheest Date: Mon, 21 Dec 2020 20:06:54 +0000 Subject: [PATCH 12/28] Update conv2d.py Fixed merge issue --- python/tvm/topi/nn/conv2d.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 80f87f86736c..a00df1fdbf0f 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -174,10 +174,14 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) +<<<<<<< HEAD pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) dilation_h, dilation_w = ( dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) ) +======= + HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) +>>>>>>> c57bd780a (Update conv2d.py) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride From 0a26e5c39a76365279741411dbc355d206ce5feb Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 6 Jan 2021 18:25:53 +0000 Subject: [PATCH 13/28] Rebase, and update responding to some comments --- python/tvm/relay/op/strategy/x86.py | 10 +++++----- python/tvm/topi/arm_cpu/group_conv2d.py | 8 ++++---- python/tvm/topi/nn/conv2d.py | 4 ---- python/tvm/topi/x86/group_conv2d.py | 10 +++++----- 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 60bd92ef63d1..98eb28ee0ef3 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("group_conv2d is not optimized for x86 with autotvm.") - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), - name="group_conv2d_nhwc.generic", - ) + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index a8fb4c6920a9..99f4d5c946d6 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -23,11 +23,11 @@ from tvm import te from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from ..util import get_const_tuple +from ..utils import get_const_tuple from ..nn.pad import pad from .. import tag -from ..nn.util import infer_pad +from ..nn.utils import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -62,8 +62,8 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, def _fallback_schedule(cfg, wkl): simd_width = 4 # assume ARM SIMD Width is 4 pad_left, pad_right = wkl.padl, wkl.padr - stride_w = wkl.wstride - out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 + stride_w = wkl.stride_w + out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 groups = wkl.groups kernels_per_group = wkl.out_filter // groups kernel_depth = wkl.in_filter // groups diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index a00df1fdbf0f..80f87f86736c 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -174,14 +174,10 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) -<<<<<<< HEAD pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) dilation_h, dilation_w = ( dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) ) -======= - HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) ->>>>>>> c57bd780a (Update conv2d.py) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 8d946ba0dd98..02dc058b39cc 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -23,12 +23,12 @@ from tvm import te from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .util import get_fp32_len -from ..util import get_const_tuple +from .utils import get_fp32_len +from ..utils import get_const_tuple from ..nn.pad import pad from .. import tag -from ..nn.util import infer_pad +from ..nn.utils import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -63,8 +63,8 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pad_left, pad_right = wkl.padl, wkl.padr - stride_w = wkl.wstride - out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1 + stride_w = wkl.stride_w + out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 groups = wkl.groups kernels_per_group = wkl.out_filter // groups kernel_depth = wkl.in_filter // groups From c79a7cac301409ca63a56a6f1db4b8893c5f95d1 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 18 Mar 2021 15:37:15 +0000 Subject: [PATCH 14/28] Fixed AutoScheduler bug for NHWC case --- python/tvm/relay/op/strategy/x86.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 98eb28ee0ef3..60bd92ef63d1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("group_conv2d is not optimized for x86 with autotvm.") - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), - name="group_conv2d_nhwc.generic", - ) + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy From bd3237234d89726d80f5cede7c6ffbf7d6171ab9 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 19 Mar 2021 18:08:16 +0000 Subject: [PATCH 15/28] removed infer_pad from GSPC --- python/tvm/topi/arm_cpu/group_conv2d.py | 9 ++------- python/tvm/topi/x86/group_conv2d.py | 9 +-------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 99f4d5c946d6..024b649d9b23 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -27,7 +27,6 @@ from ..nn.pad import pad from .. import tag -from ..nn.utils import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -298,17 +297,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - # no stride and padding info here - padding = infer_pad(data, data_pad) - hpad, wpad = padding - DOPAD = hpad != 0 or wpad != 0 - _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data - if DOPAD: + if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 02dc058b39cc..160c6dfb9cae 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -28,7 +28,6 @@ from ..nn.pad import pad from .. import tag -from ..nn.utils import infer_pad from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -275,7 +274,6 @@ def traverse(op): s[kernel].compute_inline() data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] - data_pad = None if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] @@ -298,16 +296,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - # no stride and padding info here - padding = infer_pad(data, data_pad) - hpad, wpad = padding - DOPAD = hpad != 0 or wpad != 0 - _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data - if DOPAD: + if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis From a3f9e510d3ef3b16524ce7a8647fd0debe4130f4 Mon Sep 17 00:00:00 2001 From: Wheest Date: Sun, 26 Jul 2020 15:16:05 +0100 Subject: [PATCH 16/28] Rebase, and undoing accidental removal of auto scheduler NHWC support --- python/tvm/topi/arm_cpu/group_conv2d.py | 76 +++++++--------------- python/tvm/topi/x86/group_conv2d.py | 84 +++++++++---------------- 2 files changed, 52 insertions(+), 108 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 024b649d9b23..11bff7ddd5fe 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -1,29 +1,7 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member -# pylint: disable=no-value-for-parameter,import-outside-toplevel -"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" - import tvm from tvm import autotvm from tvm import te -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity - -from ..utils import get_const_tuple +from ..util import get_const_tuple from ..nn.pad import pad from .. import tag @@ -68,20 +46,17 @@ def _fallback_schedule(cfg, wkl): kernel_depth = wkl.in_filter // groups oc_bn = 1 + for bn in range(simd_width, 0, -1): - if kernels_per_group % bn == 0: + if KPG % bn == 0: oc_bn = bn break - if oc_bn > kernels_per_group: - oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if kernel_depth % bn == 0: + if CPG % bn == 0: ic_bn = bn break - if ic_bn > kernel_depth: - ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -111,39 +86,36 @@ def group_conv2d_nchw_spatial_pack( assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + HPAD, WPAD = padding, padding elif len(padding) == 2: - hpad, wpad = padding - pad_top, pad_bottom = hpad, hpad - pad_left, pad_right = wpad, wpad + HPAD, WPAD = padding else: - pad_top, pad_left, pad_bottom, pad_right = padding - - hpad = pad_top + pad_bottom - wpad = pad_left + pad_right + HPAD, _, WPAD, _ = padding assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - stride_h, stride_w = strides, strides + HSTR, WSTR = strides, strides else: - stride_h, stride_w = strides + HSTR, WSTR = strides - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + N, CI, IH, IW = get_const_tuple(data.shape) + CO, CIG, KH, KW = get_const_tuple(kernel.shape) - pad_height = in_height + pad_top + pad_bottom - pad_width = in_width + pad_left + pad_right + pad_height = IH + 2 * HPAD + pad_width = IW + 2 * WPAD - dilated_kernel_h = (k_height - 1) * dilation_h + 1 - dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + dilated_kernel_h = (KH - 1) * dilation_h + 1 + dilated_kernel_w = (KW - 1) * dilation_w + 1 + OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 + OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 - kernels_per_group = out_channel // groups + G = groups + KPG = CO // G + CPG = CI // G - cfg.define_split("tile_ic", in_channel, num_outputs=2) - cfg.define_split("tile_oc", out_channel, num_outputs=2) - cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", CI, num_outputs=2) + cfg.define_split("tile_oc", CO, num_outputs=2) + cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. @@ -297,7 +269,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - _, W = data, kernel_vec + A, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 160c6dfb9cae..c5a6475ec913 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -1,35 +1,15 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member -# pylint: disable=no-value-for-parameter,import-outside-toplevel -"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" - import tvm from tvm import autotvm from tvm import te -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity - -from .utils import get_fp32_len -from ..utils import get_const_tuple +from .util import get_fp32_len +from ..util import get_const_tuple from ..nn.pad import pad from .. import tag from ..nn.conv2d import _get_workload as _get_conv2d_workload +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): """Compute group_conv2d with NCHW layout""" @@ -69,20 +49,17 @@ def _fallback_schedule(cfg, wkl): kernel_depth = wkl.in_filter // groups oc_bn = 1 + for bn in range(simd_width, 0, -1): - if kernels_per_group % bn == 0: + if KPG % bn == 0: oc_bn = bn break - if oc_bn > kernels_per_group: - oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if kernel_depth % bn == 0: + if CPG % bn == 0: ic_bn = bn break - if ic_bn > kernel_depth: - ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -112,39 +89,36 @@ def group_conv2d_nchw_spatial_pack( assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + HPAD, WPAD = padding, padding elif len(padding) == 2: - hpad, wpad = padding - pad_top, pad_bottom = hpad, hpad - pad_left, pad_right = wpad, wpad + HPAD, WPAD = padding else: - pad_top, pad_left, pad_bottom, pad_right = padding - - hpad = pad_top + pad_bottom - wpad = pad_left + pad_right + HPAD, _, WPAD, _ = padding assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - stride_h, stride_w = strides, strides + HSTR, WSTR = strides, strides else: - stride_h, stride_w = strides + HSTR, WSTR = strides - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + N, CI, IH, IW = get_const_tuple(data.shape) + CO, CIG, KH, KW = get_const_tuple(kernel.shape) - pad_height = in_height + pad_top + pad_bottom - pad_width = in_width + pad_left + pad_right + pad_height = IH + 2 * HPAD + pad_width = IW + 2 * WPAD - dilated_kernel_h = (k_height - 1) * dilation_h + 1 - dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + dilated_kernel_h = (KH - 1) * dilation_h + 1 + dilated_kernel_w = (KW - 1) * dilation_w + 1 + OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 + OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 - kernels_per_group = out_channel // groups + G = groups + KPG = CO // G + CPG = CI // G - cfg.define_split("tile_ic", in_channel, num_outputs=2) - cfg.define_split("tile_oc", out_channel, num_outputs=2) - cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", CI, num_outputs=2) + cfg.define_split("tile_oc", CO, num_outputs=2) + cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. @@ -296,14 +270,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - _, W = data, kernel_vec + A, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() - - groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) s[A1].parallel(parallel_axis) @@ -343,7 +316,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) - s[CC].parallel(parallel_axis) s[CC].vectorize(oc_block) From e741ca730f6678a163f5c3ba8ba87b238f6d36c4 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 27 Aug 2020 17:01:36 +0100 Subject: [PATCH 17/28] Added ASF license header --- python/tvm/topi/arm_cpu/group_conv2d.py | 17 +++++++++++++++++ python/tvm/topi/x86/group_conv2d.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 11bff7ddd5fe..de7e16f5cd16 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import tvm from tvm import autotvm from tvm import te diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index c5a6475ec913..d4bd62352a80 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import tvm from tvm import autotvm from tvm import te From 36f912856df0d1e0d5ef5c96a0b52e7630fa96af Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 27 Aug 2020 17:44:40 +0100 Subject: [PATCH 18/28] Minor bug fixes --- python/tvm/topi/x86/group_conv2d.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index d4bd62352a80..d0b25cc83e74 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -18,6 +18,8 @@ import tvm from tvm import autotvm from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + from .util import get_fp32_len from ..util import get_const_tuple from ..nn.pad import pad @@ -25,8 +27,6 @@ from ..nn.conv2d import _get_workload as _get_conv2d_workload -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity - def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): """Compute group_conv2d with NCHW layout""" @@ -67,16 +67,21 @@ def _fallback_schedule(cfg, wkl): oc_bn = 1 + oc_bn = 1 for bn in range(simd_width, 0, -1): if KPG % bn == 0: oc_bn = bn break + if oc_bn > KPG: + oc_bn = KPG ic_bn = 1 for bn in range(oc_bn, 0, -1): if CPG % bn == 0: ic_bn = bn break + if ic_bn > CPG: + ic_bn = CPG reg_n = 1 for n in range(31, 0, -1): From 12de6b8ba9a1b8594c6eb7b86de952f0ea8624d8 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 28 Aug 2020 13:19:23 +0100 Subject: [PATCH 19/28] Added asymmetric padding support Fixed linting --- python/tvm/topi/arm_cpu/group_conv2d.py | 49 +++++++++++++--------- python/tvm/topi/x86/group_conv2d.py | 56 +++++++++++++------------ 2 files changed, 59 insertions(+), 46 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index de7e16f5cd16..5bdc47cad01c 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" import tvm from tvm import autotvm @@ -64,16 +65,21 @@ def _fallback_schedule(cfg, wkl): oc_bn = 1 + oc_bn = 1 for bn in range(simd_width, 0, -1): - if KPG % bn == 0: + if kernels_per_group % bn == 0: oc_bn = bn break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if CPG % bn == 0: + if kernel_depth % bn == 0: ic_bn = bn break + if ic_bn > kernel_depth: + ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -103,36 +109,39 @@ def group_conv2d_nchw_spatial_pack( assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - HPAD, WPAD = padding, padding + pt, pl, pb, pr = padding, padding, padding, padding elif len(padding) == 2: HPAD, WPAD = padding + pt, pb = HPAD, HPAD + pl, pr = WPAD, WPAD else: - HPAD, _, WPAD, _ = padding + pt, pl, pb, pr = padding + + HPAD = pt + pb + WPAD = pl + pr assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - HSTR, WSTR = strides, strides + stride_h, stride_w = strides, strides else: - HSTR, WSTR = strides + stride_h, stride_w = strides - N, CI, IH, IW = get_const_tuple(data.shape) - CO, CIG, KH, KW = get_const_tuple(kernel.shape) + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = IH + 2 * HPAD - pad_width = IW + 2 * WPAD + pad_height = in_height + pt + pb + pad_width = in_width + pl + pr - dilated_kernel_h = (KH - 1) * dilation_h + 1 - dilated_kernel_w = (KW - 1) * dilation_w + 1 - OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 - OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 - G = groups - KPG = CO // G - CPG = CI // G + kernels_per_group = out_channel // groups - cfg.define_split("tile_ic", CI, num_outputs=2) - cfg.define_split("tile_oc", CO, num_outputs=2) - cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index d0b25cc83e74..9eeba9f09fe3 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" import tvm from tvm import autotvm @@ -69,19 +70,19 @@ def _fallback_schedule(cfg, wkl): oc_bn = 1 for bn in range(simd_width, 0, -1): - if KPG % bn == 0: + if kernels_per_group % bn == 0: oc_bn = bn break - if oc_bn > KPG: - oc_bn = KPG + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group ic_bn = 1 for bn in range(oc_bn, 0, -1): - if CPG % bn == 0: + if kernel_depth % bn == 0: ic_bn = bn break - if ic_bn > CPG: - ic_bn = CPG + if ic_bn > kernel_depth: + ic_bn = kernel_depth reg_n = 1 for n in range(31, 0, -1): @@ -111,36 +112,39 @@ def group_conv2d_nchw_spatial_pack( assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - HPAD, WPAD = padding, padding + pt, pl, pb, pr = padding, padding, padding, padding elif len(padding) == 2: HPAD, WPAD = padding + pt, pb = HPAD, HPAD + pl, pr = WPAD, WPAD else: - HPAD, _, WPAD, _ = padding + pt, pl, pb, pr = padding + + HPAD = pt + pb + WPAD = pl + pr assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): - HSTR, WSTR = strides, strides + stride_h, stride_w = strides, strides else: - HSTR, WSTR = strides + stride_h, stride_w = strides - N, CI, IH, IW = get_const_tuple(data.shape) - CO, CIG, KH, KW = get_const_tuple(kernel.shape) + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = IH + 2 * HPAD - pad_width = IW + 2 * WPAD + pad_height = in_height + pt + pb + pad_width = in_width + pl + pr - dilated_kernel_h = (KH - 1) * dilation_h + 1 - dilated_kernel_w = (KW - 1) * dilation_w + 1 - OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1 - OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1 + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 - G = groups - KPG = CO // G - CPG = CI // G + kernels_per_group = out_channel // groups - cfg.define_split("tile_ic", CI, num_outputs=2) - cfg.define_split("tile_oc", CO, num_outputs=2) - cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) cfg.define_knob("unroll_kw", [True, False]) # If no config was set, we can fallback to default config. @@ -292,13 +296,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - A, W = data, kernel_vec + _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() - groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) s[A1].parallel(parallel_axis) From ef858b4494009441e1c2623d4cb15aba2b2b97e4 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Fri, 28 Aug 2020 13:44:47 +0100 Subject: [PATCH 20/28] Improve linting --- python/tvm/topi/arm_cpu/group_conv2d.py | 26 +++++++++++++------------ python/tvm/topi/x86/group_conv2d.py | 22 ++++++++++----------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 5bdc47cad01c..bdcba2c35ecb 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -19,6 +19,8 @@ import tvm from tvm import autotvm from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + from ..util import get_const_tuple from ..nn.pad import pad from .. import tag @@ -109,16 +111,16 @@ def group_conv2d_nchw_spatial_pack( assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pt, pl, pb, pr = padding, padding, padding, padding + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding elif len(padding) == 2: - HPAD, WPAD = padding - pt, pb = HPAD, HPAD - pl, pr = WPAD, WPAD + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad else: - pt, pl, pb, pr = padding + pad_top, pad_left, pad_bottom, pad_right = padding - HPAD = pt + pb - WPAD = pl + pr + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): @@ -129,13 +131,13 @@ def group_conv2d_nchw_spatial_pack( batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = in_height + pt + pb - pad_width = in_width + pl + pr + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right dilated_kernel_h = (k_height - 1) * dilation_h + 1 dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 kernels_per_group = out_channel // groups @@ -295,7 +297,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, cfg["unroll_kw"].val, ) - A, W = data, kernel_vec + _, W = data, kernel_vec A0, A1 = data_pad, data_vec # schedule data diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 9eeba9f09fe3..4019068d4d8a 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -112,16 +112,16 @@ def group_conv2d_nchw_spatial_pack( assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 if isinstance(padding, int): - pt, pl, pb, pr = padding, padding, padding, padding + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding elif len(padding) == 2: - HPAD, WPAD = padding - pt, pb = HPAD, HPAD - pl, pr = WPAD, WPAD + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad else: - pt, pl, pb, pr = padding + pad_top, pad_left, pad_bottom, pad_right = padding - HPAD = pt + pb - WPAD = pl + pr + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right assert isinstance(strides, int) or len(strides) == 2 if isinstance(strides, int): @@ -132,13 +132,13 @@ def group_conv2d_nchw_spatial_pack( batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) - pad_height = in_height + pt + pb - pad_width = in_width + pl + pr + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right dilated_kernel_h = (k_height - 1) * dilation_h + 1 dilated_kernel_w = (k_width - 1) * dilation_w + 1 - out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1 - out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 kernels_per_group = out_channel // groups From 10d6e4018ecedc789989d712a6892103fb55a58c Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 31 Aug 2020 09:51:13 +0100 Subject: [PATCH 21/28] Better linting, disable final linting checks --- python/tvm/topi/arm_cpu/group_conv2d.py | 2 ++ python/tvm/topi/x86/group_conv2d.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index bdcba2c35ecb..64f179ba5bbb 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" import tvm diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 4019068d4d8a..0f1b1a87ac94 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" import tvm @@ -302,6 +304,8 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, # schedule data if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() + # s[A0].compute_at(s[A1]) + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis parallel_axis = s[A1].fuse(batch, ic_chunk, ih) @@ -342,8 +346,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + # s[A1].compute_at(CC, parallel_axis) s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) s[CC].unroll(ow_block) From f468988cc5458db5e43de9fe76bcce856310b52d Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 3 Sep 2020 19:56:36 +0100 Subject: [PATCH 22/28] Fixed final linting errors (figured out how to run lint tests locally) --- python/tvm/topi/x86/group_conv2d.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 0f1b1a87ac94..c1f3ece0f323 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -304,7 +304,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, # schedule data if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() - # s[A0].compute_at(s[A1]) groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis @@ -346,10 +345,8 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) - # s[A1].compute_at(CC, parallel_axis) - s[CC].parallel(parallel_axis) - + s[CC].parallel(parallel_axis) s[CC].vectorize(oc_block) From 6fef83a0b64d21523c10e094506ad1c5ec209af1 Mon Sep 17 00:00:00 2001 From: Wheest Date: Mon, 21 Dec 2020 20:06:54 +0000 Subject: [PATCH 23/28] Update conv2d.py Fixed merge issue --- python/tvm/topi/nn/conv2d.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 80f87f86736c..a00df1fdbf0f 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -174,10 +174,14 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) +<<<<<<< HEAD pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) dilation_h, dilation_w = ( dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) ) +======= + HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) +>>>>>>> c57bd780a (Update conv2d.py) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride From cf87146ce075ed89a3763a4cff41d6834f514d5a Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 6 Jan 2021 18:25:53 +0000 Subject: [PATCH 24/28] Rebase, and update responding to some comments --- python/tvm/relay/op/strategy/x86.py | 10 +++++----- python/tvm/topi/arm_cpu/group_conv2d.py | 2 +- python/tvm/topi/nn/conv2d.py | 4 ---- python/tvm/topi/x86/group_conv2d.py | 4 ++-- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 60bd92ef63d1..98eb28ee0ef3 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("group_conv2d is not optimized for x86 with autotvm.") - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), - name="group_conv2d_nhwc.generic", - ) + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 64f179ba5bbb..6a6e578803db 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -23,7 +23,7 @@ from tvm import te from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from ..util import get_const_tuple +from ..utils import get_const_tuple from ..nn.pad import pad from .. import tag diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index a00df1fdbf0f..80f87f86736c 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -174,14 +174,10 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) -<<<<<<< HEAD pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) dilation_h, dilation_w = ( dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) ) -======= - HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) ->>>>>>> c57bd780a (Update conv2d.py) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index c1f3ece0f323..7ac3a8928650 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -23,8 +23,8 @@ from tvm import te from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .util import get_fp32_len -from ..util import get_const_tuple +from .utils import get_fp32_len +from ..utils import get_const_tuple from ..nn.pad import pad from .. import tag From 8f74750853f1e6abe489c9f6c52fd4c126fe3b3d Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Thu, 18 Mar 2021 15:37:15 +0000 Subject: [PATCH 25/28] Fixed AutoScheduler bug for NHWC case --- python/tvm/relay/op/strategy/x86.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 98eb28ee0ef3..60bd92ef63d1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("group_conv2d is not optimized for x86 with autotvm.") - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), - name="group_conv2d_nhwc.generic", - ) + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy From 53b4d5bd74a64b268d06026b3e91f5abd7254f4e Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Tue, 23 Mar 2021 14:19:33 +0000 Subject: [PATCH 26/28] Minor fix --- python/tvm/topi/x86/group_conv2d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 7ac3a8928650..7e9c532fb15b 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -276,6 +276,7 @@ def traverse(op): s[kernel].compute_inline() data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] + data_pad = None if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] From fd23e11c2394f8abcf77e58f05985c3ab0149482 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Tue, 23 Mar 2021 16:54:05 +0000 Subject: [PATCH 27/28] Fixed removal of infer_pad to no padding --- python/tvm/topi/arm_cpu/group_conv2d.py | 2 +- python/tvm/topi/x86/group_conv2d.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index 6a6e578803db..c7fad430836c 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -303,7 +303,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, A0, A1 = data_pad, data_vec # schedule data - if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: + if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index 7e9c532fb15b..a6ac7e5ac5a3 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -303,7 +303,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, A0, A1 = data_pad, data_vec # schedule data - if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: + if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis From 7a2c3caa472e0fceaa8bb10709883f30d23de328 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Tue, 23 Mar 2021 20:01:31 +0000 Subject: [PATCH 28/28] Fixed unexpected linting error --- python/tvm/topi/arm_cpu/group_conv2d.py | 6 +++++- python/tvm/topi/x86/group_conv2d.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py index c7fad430836c..d852b9acef66 100644 --- a/python/tvm/topi/arm_cpu/group_conv2d.py +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -303,7 +303,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, A0, A1 = data_pad, data_vec # schedule data - if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: + if ( + data_pad is not None + and isinstance(data_pad.op, tvm.te.ComputeOp) + and "pad" in data_pad.op.tag + ): s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py index a6ac7e5ac5a3..0501c5534cf2 100644 --- a/python/tvm/topi/x86/group_conv2d.py +++ b/python/tvm/topi/x86/group_conv2d.py @@ -303,7 +303,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, A0, A1 = data_pad, data_vec # schedule data - if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag: + if ( + data_pad is not None + and isinstance(data_pad.op, tvm.te.ComputeOp) + and "pad" in data_pad.op.tag + ): s[A0].compute_inline() groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis