From d572a5567a2a860696f2422b2029a763be950477 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Mon, 21 Dec 2020 15:33:59 +0000
Subject: [PATCH 01/28] integrated with v0.8

---
 python/tvm/relay/op/strategy/x86.py     |   8 +-
 python/tvm/topi/arm_cpu/__init__.py     |   1 +
 python/tvm/topi/arm_cpu/group_conv2d.py | 342 +++++++++++++++++++++++
 python/tvm/topi/x86/__init__.py         |   1 +
 python/tvm/topi/x86/group_conv2d.py     | 343 ++++++++++++++++++++++++
 5 files changed, 690 insertions(+), 5 deletions(-)
 create mode 100644 python/tvm/topi/arm_cpu/group_conv2d.py
 create mode 100644 python/tvm/topi/x86/group_conv2d.py

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 1f37a4f8e98c..60bd92ef63d1 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -205,12 +205,10 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
     else:  # group_conv2d
         if layout == "NCHW":
             assert kernel_layout == "OIHW"
-            if not is_auto_scheduler_enabled():
-                logger.warning("group_conv2d is not optimized for x86 with autotvm.")
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.generic",
+                wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True),
+                wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw),
+                name="group_conv2d_nchw.x86",
             )
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py
index e121fbc7ec6d..9e2057a7126f 100644
--- a/python/tvm/topi/arm_cpu/__init__.py
+++ b/python/tvm/topi/arm_cpu/__init__.py
@@ -26,3 +26,4 @@
 from .bitserial_dense import *
 from .injective import *
 from . import cortex_m7
+from .group_conv2d import *
diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
new file mode 100644
index 000000000000..bf0925698ec5
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -0,0 +1,342 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+# pylint: disable=no-value-for-parameter,import-outside-toplevel
+"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
+
+import tvm
+from tvm import autotvm
+from tvm import te
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
+from ..utils import get_const_tuple
+from ..nn.pad import pad
+from .. import tag
+
+from ..nn.utils import infer_pad
+from ..nn.conv2d import _get_workload as _get_conv2d_workload
+
+
+
+
+def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
+                      out_dtype):
+    """Compute group_conv2d with NCHW layout"""
+    return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding,
+                                          dilation, groups, out_dtype)
+
+
+def schedule_group_conv2d_nchw(outs):
+    """Compute group_conv2d with NCHW layout"""
+    return schedule_group_conv2d_nchwc(outs)
+
+
+def _get_default_config(cfg, data, kernel, strides, padding, groups,
+                        out_dtype, layout='NCHW'):
+    """
+    Get default schedule config for the workload
+    """
+    static_data_shape = []
+    for dim in get_const_tuple(data.shape):
+        if isinstance(dim, tvm.tir.Var):
+            static_data_shape.append(1)
+        else:
+            static_data_shape.append(dim)
+    data = te.placeholder(static_data_shape, dtype=data.dtype)
+
+    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype,
+                               layout)
+    _fallback_schedule(cfg, wkl)
+
+
+def _fallback_schedule(cfg, wkl):
+    simd_width = 4 # assume ARM SIMD Width is 4
+    pad_left, pad_right = wkl.padl, wkl.padr
+    stride_w = wkl.wstride
+    out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
+    groups = wkl.groups
+    kernels_per_group = wkl.out_filter // groups
+    kernel_depth = wkl.in_filter // groups
+
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if kernels_per_group % bn == 0:
+            oc_bn = bn
+            break
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
+
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if kernel_depth % bn == 0:
+            ic_bn = bn
+            break
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
+
+    reg_n = 1
+    for n in range(31, 0, -1):
+        if out_width % n == 0:
+            reg_n = n
+            break
+
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+    cfg["unroll_kw"] = OtherOptionEntity(False)
+
+
+@autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu")
+def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
+                                   dilation, groups, out_dtype='float32'):
+    """
+    Compute group conv2d with NCHW layout, using GSPC algorithm.
+    https://arxiv.org/abs/2006.09791
+    """
+    assert isinstance(dilation, int) or len(dilation) == 2
+    if isinstance(dilation, int):
+        dilation_h, dilation_w = dilation, dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
+    if isinstance(padding, int):
+        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
+    elif len(padding) == 2:
+        hpad, wpad = padding
+        pad_top, pad_bottom = hpad, hpad
+        pad_left, pad_right = wpad, wpad
+    else:
+        pad_top, pad_left, pad_bottom, pad_right = padding
+
+    hpad = pad_top + pad_bottom
+    wpad = pad_left + pad_right
+
+    assert isinstance(strides, int) or len(strides) == 2
+    if isinstance(strides, int):
+        stride_h, stride_w = strides, strides
+    else:
+        stride_h, stride_w = strides
+
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
+
+    pad_height = in_height + pad_top + pad_bottom
+    pad_width = in_width + pad_left + pad_right
+
+    dilated_kernel_h = (k_height - 1) * dilation_h + 1
+    dilated_kernel_w = (k_width - 1) * dilation_w + 1
+    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
+
+    kernels_per_group = out_channel // groups
+
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_knob("unroll_kw", [True, False])
+
+    # If no config was set, we can fallback to default config.
+    if cfg.is_fallback:
+        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
+                                                dtype=data.dtype),
+                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
+                                           dtype=kernel.dtype),
+                            strides, padding, groups, out_dtype)
+
+    oc_bn = cfg['tile_oc'].size[-1]
+    ic_bn = cfg['tile_ic'].size[-1]
+    # pack data
+    DOPAD = (hpad != 0 or wpad != 0)
+    if DOPAD:
+        data_pad = pad(data,
+                       (0, 0, pad_top, pad_left),
+                       (0, 0, pad_bottom, pad_right),
+                       name="data_pad")
+    else:
+        data_pad = data
+
+    shape = (groups, batch_size, kernel_depth // ic_bn,
+             pad_height, ic_bn, pad_width)
+
+    data_vec = te.compute(shape,
+                          lambda g, n, C, h, c, w:
+                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
+                          name='data_vec')
+
+    # pack kernel
+    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
+             k_height, k_width, ic_bn, oc_bn)
+    kernel_vec = te.compute(shape,
+                            lambda g, out_channel, in_channel, h, w, ci, co:
+                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
+                                   in_channel * ic_bn + ci, h, w],
+                            name='kernel_vec')
+
+    # convolution
+    oshape = (groups, batch_size, kernels_per_group//oc_bn,
+              out_height, out_width, oc_bn)
+    unpack_shape = (batch_size, out_channel, out_height, out_width)
+
+    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
+    kh = te.reduce_axis((0, k_height), name='kh')
+    kw = te.reduce_axis((0, k_width), name='kw')
+    idxmod = tvm.tir.indexmod
+    idxdiv = tvm.tir.indexdiv
+
+    conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
+                      te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
+                                      oh * stride_h + kh * dilation_h,
+                                      idxmod(ic, ic_bn),
+                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
+                             kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
+                                        kh, kw, idxmod(ic, ic_bn),
+                                        oc_block].astype(out_dtype),
+                             axis=[ic, kh, kw]), name='conv')
+
+    unpack = te.compute(unpack_shape,
+                        lambda n, c, h, w:
+                        conv[idxdiv(c, kernels_per_group), n,
+                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
+                             h, w,
+                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
+                        .astype(out_dtype),
+                        name='output_unpack',
+                        tag='group_conv2d_nchw')
+    return unpack
+
+
+@autotvm.register_topi_schedule("group_conv2d_nchw.arm_cpu")
+def schedule_group_conv2d_nchwc(cfg, outs):
+    """Create schedule for tensors"""
+    s = te.create_schedule([x.op for x in outs])
+    scheduled_ops = []
+
+    def traverse(op):
+        """Traverse operators from computation graph"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(op.tag):
+            if op not in s.outputs:
+                s[op].compute_inline()
+            for tensor in op.input_tensors:
+                if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops:
+                    traverse(tensor.op)
+
+        if 'group_conv2d_nchw' in op.tag:
+            output = op.output(0)
+
+            if "tile_ic" not in cfg:
+                return
+            conv_out = op.input_tensors[0]
+            kernel_vec = conv_out.op.input_tensors[1]
+            kernel = kernel_vec.op.input_tensors[0]
+            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
+            data_vec = conv_out.op.input_tensors[0]
+            data = data_vec.op.input_tensors[0]
+            data_pad = None
+            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
+                data_pad = data
+                data = data_pad.op.input_tensors[0]
+
+            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
+                    output, outs[0]]
+            _schedule_gspc_nchw(*args)
+
+        scheduled_ops.append(op)
+
+    traverse(outs[0].op)
+    return s
+
+
+def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
+                        conv_out, output, last):
+    """Schedule GSPC"""
+    ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                                      cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
+
+    # no stride and padding info here
+    padding = infer_pad(data, data_pad)
+    hpad, wpad = padding
+    DOPAD = (hpad != 0 or wpad != 0)
+
+    _, W = data, kernel_vec
+    A0, A1 = data_pad, data_vec
+
+    # schedule data
+    if DOPAD:
+        s[A0].compute_inline()
+    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
+
+    parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
+    s[A1].parallel(parallel_axis)
+
+    # schedule kernel pack
+    groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
+    s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
+
+    if oc_bn > 1:
+        s[W].vectorize(oc_block)
+
+    parallel_axis = s[W].fuse(groups, oc_chunk, oh)
+    s[W].parallel(parallel_axis)
+
+    # schedule conv
+    C, O0, O = conv_out, output, last
+    CC = s.cache_write(C, 'global')
+
+    _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis
+
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
+
+    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+    s[C].fuse(oc_chunk, oh)
+    s[C].vectorize(oc_block)
+
+    groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis
+
+    ic, kh, kw = s[CC].op.reduce_axis
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
+
+    if unroll_kw:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw,
+                      ow_block, oc_block)
+        s[CC].unroll(kw)
+    else:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block,
+                      ow_block, oc_block)
+
+    parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
+    s[CC].parallel(parallel_axis)
+
+    s[CC].vectorize(oc_block)
+
+    s[CC].unroll(ow_block)
+
+    if O0 != O:
+        s[O0].compute_inline()
+
+    batch, oc, oh, ow = s[O].op.axis
+    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
+    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
+
+    s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block)
+    parallel_axis = s[O].fuse(oc_chunk, oh)
+    s[O].vectorize(oc_block)
+    s[O].parallel(parallel_axis)
+    return s
diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py
index bb6a7cdd4122..d1bd58dd4831 100644
--- a/python/tvm/topi/x86/__init__.py
+++ b/python/tvm/topi/x86/__init__.py
@@ -41,3 +41,4 @@
 from .conv2d_alter_op import *
 from .dense_alter_op import *
 from .scatter import *
+from .group_conv2d import *
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
new file mode 100644
index 000000000000..158795eb4e17
--- /dev/null
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -0,0 +1,343 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+# pylint: disable=no-value-for-parameter,import-outside-toplevel
+"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
+
+import tvm
+from tvm import autotvm
+from tvm import te
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
+from .utils import get_fp32_len
+from ..utils import get_const_tuple
+from ..nn.pad import pad
+from .. import tag
+
+from ..nn.utils import infer_pad
+from ..nn.conv2d import _get_workload as _get_conv2d_workload
+
+
+def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
+                      out_dtype):
+    """Compute group_conv2d with NCHW layout"""
+    return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding,
+                                          dilation, groups, out_dtype)
+
+
+def schedule_group_conv2d_nchw(outs):
+    """Compute group_conv2d with NCHW layout"""
+    return schedule_group_conv2d_nchwc(outs)
+
+
+def _get_default_config(cfg, data, kernel, strides, padding, groups,
+                        out_dtype, layout='NCHW'):
+    """
+    Get default schedule config for the workload
+    """
+    static_data_shape = []
+    for dim in get_const_tuple(data.shape):
+        if isinstance(dim, tvm.tir.Var):
+            static_data_shape.append(1)
+        else:
+            static_data_shape.append(dim)
+    data = te.placeholder(static_data_shape, dtype=data.dtype)
+
+    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype,
+                               layout)
+    _fallback_schedule(cfg, wkl)
+
+
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
+    hpad = wkl.hpad
+    stride_w = wkl.wstride
+    out_width = (wkl.width + 2*hpad - wkl.wkernel) // stride_w + 1
+    groups = wkl.groups
+    kernels_per_group = wkl.out_filter // groups
+    kernel_depth = wkl.in_filter // groups
+
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if kernels_per_group % bn == 0:
+            oc_bn = bn
+            break
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
+
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if kernel_depth % bn == 0:
+            ic_bn = bn
+            break
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
+
+    reg_n = 1
+    for n in range(31, 0, -1):
+        if out_width % n == 0:
+            reg_n = n
+            break
+
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+    cfg["unroll_kw"] = OtherOptionEntity(False)
+
+
+@autotvm.register_topi_compute("group_conv2d_nchw.x86")
+def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
+                                   dilation, groups, out_dtype='float32'):
+    """
+    Compute group conv2d with NCHW layout, using GSPC algorithm.
+    https://arxiv.org/abs/2006.09791
+    """
+    assert isinstance(dilation, int) or len(dilation) == 2
+    if isinstance(dilation, int):
+        dilation_h, dilation_w = dilation, dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
+    if isinstance(padding, int):
+        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
+    elif len(padding) == 2:
+        hpad, wpad = padding
+        pad_top, pad_bottom = hpad, hpad
+        pad_left, pad_right = wpad, wpad
+    else:
+        pad_top, pad_left, pad_bottom, pad_right = padding
+
+    hpad = pad_top + pad_bottom
+    wpad = pad_left + pad_right
+
+    assert isinstance(strides, int) or len(strides) == 2
+    if isinstance(strides, int):
+        stride_h, stride_w = strides, strides
+    else:
+        stride_h, stride_w = strides
+
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
+
+    pad_height = in_height + pad_top + pad_bottom
+    pad_width = in_width + pad_left + pad_right
+
+    dilated_kernel_h = (k_height - 1) * dilation_h + 1
+    dilated_kernel_w = (k_width - 1) * dilation_w + 1
+    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
+
+    kernels_per_group = out_channel // groups
+
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_knob("unroll_kw", [True, False])
+
+    # If no config was set, we can fallback to default config.
+    if cfg.is_fallback:
+        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
+                                                dtype=data.dtype),
+                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
+                                           dtype=kernel.dtype),
+                            strides, padding, groups, out_dtype)
+
+    oc_bn = cfg['tile_oc'].size[-1]
+    ic_bn = cfg['tile_ic'].size[-1]
+    # pack data
+    DOPAD = (hpad != 0 or wpad != 0)
+    if DOPAD:
+        data_pad = pad(data,
+                       (0, 0, pad_top, pad_left),
+                       (0, 0, pad_bottom, pad_right),
+                       name="data_pad")
+    else:
+        data_pad = data
+
+    shape = (groups, batch_size, kernel_depth // ic_bn,
+             pad_height, ic_bn, pad_width)
+
+    data_vec = te.compute(shape,
+                          lambda g, n, C, h, c, w:
+                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
+                          name='data_vec')
+
+    # pack kernel
+    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
+             k_height, k_width, ic_bn, oc_bn)
+    kernel_vec = te.compute(shape,
+                            lambda g, out_channel, in_channel, h, w, ci, co:
+                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
+                                   in_channel * ic_bn + ci, h, w],
+                            name='kernel_vec')
+
+    # convolution
+    oshape = (groups, batch_size, kernels_per_group//oc_bn,
+              out_height, out_width, oc_bn)
+    unpack_shape = (batch_size, out_channel, out_height, out_width)
+
+    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
+    kh = te.reduce_axis((0, k_height), name='kh')
+    kw = te.reduce_axis((0, k_width), name='kw')
+    idxmod = tvm.tir.indexmod
+    idxdiv = tvm.tir.indexdiv
+
+    conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
+                      te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
+                                      oh * stride_h + kh * dilation_h,
+                                      idxmod(ic, ic_bn),
+                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
+                             kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
+                                        kh, kw, idxmod(ic, ic_bn),
+                                        oc_block].astype(out_dtype),
+                             axis=[ic, kh, kw]), name='conv')
+
+    unpack = te.compute(unpack_shape,
+                        lambda n, c, h, w:
+                        conv[idxdiv(c, kernels_per_group), n,
+                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
+                             h, w,
+                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
+                        .astype(out_dtype),
+                        name='output_unpack',
+                        tag='group_conv2d_nchw')
+    return unpack
+
+
+@autotvm.register_topi_schedule("group_conv2d_nchw.x86")
+def schedule_group_conv2d_nchwc(cfg, outs):
+    """Create schedule for tensors"""
+    s = te.create_schedule([x.op for x in outs])
+    scheduled_ops = []
+
+    def traverse(op):
+        """Traverse operators from computation graph"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(op.tag):
+            if op not in s.outputs:
+                s[op].compute_inline()
+            for tensor in op.input_tensors:
+                if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops:
+                    traverse(tensor.op)
+
+        if 'group_conv2d_nchw' in op.tag:
+            output = op.output(0)
+
+            if "tile_ic" not in cfg:
+                return
+            conv_out = op.input_tensors[0]
+            kernel_vec = conv_out.op.input_tensors[1]
+            kernel = kernel_vec.op.input_tensors[0]
+            if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
+            data_vec = conv_out.op.input_tensors[0]
+            data = data_vec.op.input_tensors[0]
+            data_pad = None
+            if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
+                data_pad = data
+                data = data_pad.op.input_tensors[0]
+
+            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
+                    output, outs[0]]
+            _schedule_gspc_nchw(*args)
+
+        scheduled_ops.append(op)
+
+    traverse(outs[0].op)
+    return s
+
+
+def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
+                        conv_out, output, last):
+    """Schedule GSPC"""
+    ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                                      cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
+
+    # no stride and padding info here
+    padding = infer_pad(data, data_pad)
+    hpad, wpad = padding
+    DOPAD = (hpad != 0 or wpad != 0)
+
+    _, W = data, kernel_vec
+    A0, A1 = data_pad, data_vec
+
+    # schedule data
+    if DOPAD:
+        s[A0].compute_inline()
+
+    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
+
+    parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
+    s[A1].parallel(parallel_axis)
+
+    # schedule kernel pack
+    groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
+    s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
+
+    if oc_bn > 1:
+        s[W].vectorize(oc_block)
+
+    parallel_axis = s[W].fuse(groups, oc_chunk, oh)
+    s[W].parallel(parallel_axis)
+
+    # schedule conv
+    C, O0, O = conv_out, output, last
+    CC = s.cache_write(C, 'global')
+
+    _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis
+
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
+
+    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+    s[C].fuse(oc_chunk, oh)
+    s[C].vectorize(oc_block)
+
+    groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis
+
+    ic, kh, kw = s[CC].op.reduce_axis
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
+
+    if unroll_kw:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw,
+                      ow_block, oc_block)
+        s[CC].unroll(kw)
+    else:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block,
+                      ow_block, oc_block)
+
+    parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
+
+    s[CC].parallel(parallel_axis)
+
+    s[CC].vectorize(oc_block)
+
+    s[CC].unroll(ow_block)
+
+    if O0 != O:
+        s[O0].compute_inline()
+
+    batch, oc, oh, ow = s[O].op.axis
+    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
+    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
+
+    s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block)
+    parallel_axis = s[O].fuse(oc_chunk, oh)
+    s[O].vectorize(oc_block)
+    s[O].parallel(parallel_axis)
+    return s

From f32132bed42e71d2f00e1e1e75d2d168177de18c Mon Sep 17 00:00:00 2001
From: Wheest <perry@gibsonic.org>
Date: Sun, 26 Jul 2020 15:16:05 +0100
Subject: [PATCH 02/28] Rebase, and undoing accidental removal of auto
 scheduler NHWC support

---
 python/tvm/relay/op/strategy/arm_cpu.py |   7 +-
 python/tvm/topi/arm_cpu/group_conv2d.py | 168 ++++++++++-------------
 python/tvm/topi/x86/group_conv2d.py     | 172 ++++++++++--------------
 3 files changed, 141 insertions(+), 206 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 985124e305ee..36200bb024a9 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -207,11 +207,10 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     else:  # group_conv2d
         if layout == "NCHW":
             assert kernel_layout == "OIHW"
-            logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.")
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.generic",
+                wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True),
+                wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw),
+                name="group_conv2d_nchw.arm_cpu"
             )
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index bf0925698ec5..c8ae9bc46fbe 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -1,36 +1,15 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
-
 import tvm
 from tvm import autotvm
 from tvm import te
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from ..utils import get_const_tuple
+from ..util import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.utils import infer_pad
+from ..nn.conv2d import group_conv2d_nchw
+from ..nn.util import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
-
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
 
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
@@ -45,8 +24,8 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups,
-                        out_dtype, layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
+                        layout='NCHW'):
     """
     Get default schedule config for the workload
     """
@@ -65,28 +44,24 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups,
 
 def _fallback_schedule(cfg, wkl):
     simd_width = 4 # assume ARM SIMD Width is 4
-    pad_left, pad_right = wkl.padl, wkl.padr
-    stride_w = wkl.wstride
-    out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
-    groups = wkl.groups
-    kernels_per_group = wkl.out_filter // groups
-    kernel_depth = wkl.in_filter // groups
-
+    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HSTR, WSTR = wkl.hstride, wkl.wstride
+    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    G = wkl.groups
+    KPG = wkl.out_filter // G
+    CPG = wkl.in_filter // G
     oc_bn = 1
+
     for bn in range(simd_width, 0, -1):
-        if kernels_per_group % bn == 0:
+        if KPG % bn == 0:
             oc_bn = bn
             break
-    if oc_bn > kernels_per_group:
-        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if kernel_depth % bn == 0:
+        if CPG % bn == 0:
             ic_bn = bn
             break
-    if ic_bn > kernel_depth:
-        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -103,10 +78,6 @@ def _fallback_schedule(cfg, wkl):
 @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu")
 def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
                                    dilation, groups, out_dtype='float32'):
-    """
-    Compute group conv2d with NCHW layout, using GSPC algorithm.
-    https://arxiv.org/abs/2006.09791
-    """
     assert isinstance(dilation, int) or len(dilation) == 2
     if isinstance(dilation, int):
         dilation_h, dilation_w = dilation, dilation
@@ -115,94 +86,87 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
+        HPAD, WPAD = padding, padding
     elif len(padding) == 2:
-        hpad, wpad = padding
-        pad_top, pad_bottom = hpad, hpad
-        pad_left, pad_right = wpad, wpad
+        HPAD, WPAD = padding
     else:
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-    hpad = pad_top + pad_bottom
-    wpad = pad_left + pad_right
+        HPAD, _, WPAD, _ = padding
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        stride_h, stride_w = strides, strides
+        HSTR, WSTR = strides, strides
     else:
-        stride_h, stride_w = strides
+        HSTR, WSTR = strides
 
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pad_top + pad_bottom
-    pad_width = in_width + pad_left + pad_right
+    pad_height = IH + 2 * HPAD
+    pad_width = IW + 2 * WPAD
 
-    dilated_kernel_h = (k_height - 1) * dilation_h + 1
-    dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
+    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
 
-    kernels_per_group = out_channel // groups
+    G = groups
+    KPG = CO // G
+    CPG = CI // G
 
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", CI, num_outputs=2)
+    cfg.define_split("tile_oc", CO, num_outputs=2)
+    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
-                                                dtype=data.dtype),
-                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
+        _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype),
+                            te.placeholder((N, CI // G, KH, KW),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
 
     oc_bn = cfg['tile_oc'].size[-1]
     ic_bn = cfg['tile_ic'].size[-1]
     # pack data
-    DOPAD = (hpad != 0 or wpad != 0)
+    DOPAD = (HPAD != 0 or WPAD != 0)
     if DOPAD:
-        data_pad = pad(data,
-                       (0, 0, pad_top, pad_left),
-                       (0, 0, pad_bottom, pad_right),
-                       name="data_pad")
+        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
     else:
         data_pad = data
 
-    shape = (groups, batch_size, kernel_depth // ic_bn,
+    shape = (G, N, CPG // ic_bn,
              pad_height, ic_bn, pad_width)
 
     data_vec = te.compute(shape,
                           lambda g, n, C, h, c, w:
-                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
+                          data_pad[n, C * ic_bn + c + CPG * g, h, w],
                           name='data_vec')
 
     # pack kernel
-    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
-             k_height, k_width, ic_bn, oc_bn)
+    shape = (G, KPG//oc_bn, CPG//ic_bn,
+             KH, KW, ic_bn, oc_bn)
     kernel_vec = te.compute(shape,
-                            lambda g, out_channel, in_channel, h, w, ci, co:
-                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
-                                   in_channel * ic_bn + ci, h, w],
+                            lambda g, CO, CI, h, w, ci, co:
+                            kernel[(CO * oc_bn + co + g * KPG),
+                                   CI * ic_bn + ci, h, w],
                             name='kernel_vec')
 
     # convolution
-    oshape = (groups, batch_size, kernels_per_group//oc_bn,
-              out_height, out_width, oc_bn)
-    unpack_shape = (batch_size, out_channel, out_height, out_width)
+    oshape = (G, N, KPG//oc_bn,
+              OH, OW, oc_bn)
+    unpack_shape = (N, CO, OH, OW)
 
-    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
-    kh = te.reduce_axis((0, k_height), name='kh')
-    kw = te.reduce_axis((0, k_width), name='kw')
+    ic = te.reduce_axis((0, (CPG)), name='ic')
+    kh = te.reduce_axis((0, KH), name='kh')
+    kw = te.reduce_axis((0, KW), name='kw')
     idxmod = tvm.tir.indexmod
     idxdiv = tvm.tir.indexdiv
 
     conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
                       te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
-                                      oh * stride_h + kh * dilation_h,
+                                      oh*HSTR+kh*dilation_h,
                                       idxmod(ic, ic_bn),
-                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
+                                      ow*WSTR+kw*dilation_w].astype(out_dtype) *
                              kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
                                         kh, kw, idxmod(ic, ic_bn),
                                         oc_block].astype(out_dtype),
@@ -210,10 +174,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     unpack = te.compute(unpack_shape,
                         lambda n, c, h, w:
-                        conv[idxdiv(c, kernels_per_group), n,
-                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
+                        conv[idxdiv(c, KPG), n,
+                             idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)),
                              h, w,
-                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
+                             idxmod(idxmod(c, oc_bn), KPG)]
                         .astype(out_dtype),
                         name='output_unpack',
                         tag='group_conv2d_nchw')
@@ -253,9 +217,12 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
+            _, c, h, w = get_const_tuple(data.shape)
+            _, x, kh, kw = get_const_tuple(kernel.shape)
+
             args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
                     output, outs[0]]
-            _schedule_gspc_nchw(*args)
+            schedule_conv_sp_grouped(*args)
 
         scheduled_ops.append(op)
 
@@ -263,24 +230,25 @@ def traverse(op):
     return s
 
 
-def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
-                        conv_out, output, last):
-    """Schedule GSPC"""
+def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec,
+                             conv_out, output, last,
+                             **kwargs):
+    # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    hpad, wpad = padding
-    DOPAD = (hpad != 0 or wpad != 0)
+    HPAD, WPAD = padding
+    DOPAD = (HPAD != 0 or WPAD != 0)
 
-    _, W = data, kernel_vec
+    A, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
     if DOPAD:
         s[A0].compute_inline()
-    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
+    groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
     s[A1].parallel(parallel_axis)
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 158795eb4e17..310257099e8c 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -1,36 +1,17 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
-
 import tvm
 from tvm import autotvm
 from tvm import te
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from .utils import get_fp32_len
-from ..utils import get_const_tuple
+from .util import get_fp32_len
+from ..util import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.utils import infer_pad
+from ..nn.conv2d import group_conv2d_nchw
+from ..nn.util import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
                       out_dtype):
@@ -44,8 +25,8 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups,
-                        out_dtype, layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
+                        layout='NCHW'):
     """
     Get default schedule config for the workload
     """
@@ -64,28 +45,24 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups,
 
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
-    hpad = wkl.hpad
-    stride_w = wkl.wstride
-    out_width = (wkl.width + 2*hpad - wkl.wkernel) // stride_w + 1
-    groups = wkl.groups
-    kernels_per_group = wkl.out_filter // groups
-    kernel_depth = wkl.in_filter // groups
-
+    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HSTR, WSTR = wkl.hstride, wkl.wstride
+    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    G = wkl.groups
+    KPG = wkl.out_filter // G
+    CPG = wkl.in_filter // G
     oc_bn = 1
+
     for bn in range(simd_width, 0, -1):
-        if kernels_per_group % bn == 0:
+        if KPG % bn == 0:
             oc_bn = bn
             break
-    if oc_bn > kernels_per_group:
-        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if kernel_depth % bn == 0:
+        if CPG % bn == 0:
             ic_bn = bn
             break
-    if ic_bn > kernel_depth:
-        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -102,10 +79,6 @@ def _fallback_schedule(cfg, wkl):
 @autotvm.register_topi_compute("group_conv2d_nchw.x86")
 def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
                                    dilation, groups, out_dtype='float32'):
-    """
-    Compute group conv2d with NCHW layout, using GSPC algorithm.
-    https://arxiv.org/abs/2006.09791
-    """
     assert isinstance(dilation, int) or len(dilation) == 2
     if isinstance(dilation, int):
         dilation_h, dilation_w = dilation, dilation
@@ -114,94 +87,87 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
+        HPAD, WPAD = padding, padding
     elif len(padding) == 2:
-        hpad, wpad = padding
-        pad_top, pad_bottom = hpad, hpad
-        pad_left, pad_right = wpad, wpad
+        HPAD, WPAD = padding
     else:
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-    hpad = pad_top + pad_bottom
-    wpad = pad_left + pad_right
+        HPAD, _, WPAD, _ = padding
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        stride_h, stride_w = strides, strides
+        HSTR, WSTR = strides, strides
     else:
-        stride_h, stride_w = strides
+        HSTR, WSTR = strides
 
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pad_top + pad_bottom
-    pad_width = in_width + pad_left + pad_right
+    pad_height = IH + 2 * HPAD
+    pad_width = IW + 2 * WPAD
 
-    dilated_kernel_h = (k_height - 1) * dilation_h + 1
-    dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
+    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
 
-    kernels_per_group = out_channel // groups
+    G = groups
+    KPG = CO // G
+    CPG = CI // G
 
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", CI, num_outputs=2)
+    cfg.define_split("tile_oc", CO, num_outputs=2)
+    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
-                                                dtype=data.dtype),
-                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
+        _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype),
+                            te.placeholder((N, CI // G, KH, KW),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
 
     oc_bn = cfg['tile_oc'].size[-1]
     ic_bn = cfg['tile_ic'].size[-1]
     # pack data
-    DOPAD = (hpad != 0 or wpad != 0)
+    DOPAD = (HPAD != 0 or WPAD != 0)
     if DOPAD:
-        data_pad = pad(data,
-                       (0, 0, pad_top, pad_left),
-                       (0, 0, pad_bottom, pad_right),
-                       name="data_pad")
+        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
     else:
         data_pad = data
 
-    shape = (groups, batch_size, kernel_depth // ic_bn,
+    shape = (G, N, CPG // ic_bn,
              pad_height, ic_bn, pad_width)
 
     data_vec = te.compute(shape,
                           lambda g, n, C, h, c, w:
-                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
+                          data_pad[n, C * ic_bn + c + CPG * g, h, w],
                           name='data_vec')
 
     # pack kernel
-    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
-             k_height, k_width, ic_bn, oc_bn)
+    shape = (G, KPG//oc_bn, CPG//ic_bn,
+             KH, KW, ic_bn, oc_bn)
     kernel_vec = te.compute(shape,
-                            lambda g, out_channel, in_channel, h, w, ci, co:
-                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
-                                   in_channel * ic_bn + ci, h, w],
+                            lambda g, CO, CI, h, w, ci, co:
+                            kernel[(CO * oc_bn + co + g * KPG),
+                                   CI * ic_bn + ci, h, w],
                             name='kernel_vec')
 
     # convolution
-    oshape = (groups, batch_size, kernels_per_group//oc_bn,
-              out_height, out_width, oc_bn)
-    unpack_shape = (batch_size, out_channel, out_height, out_width)
+    oshape = (G, N, KPG//oc_bn,
+              OH, OW, oc_bn)
+    unpack_shape = (N, CO, OH, OW)
 
-    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
-    kh = te.reduce_axis((0, k_height), name='kh')
-    kw = te.reduce_axis((0, k_width), name='kw')
+    ic = te.reduce_axis((0, (CPG)), name='ic')
+    kh = te.reduce_axis((0, KH), name='kh')
+    kw = te.reduce_axis((0, KW), name='kw')
     idxmod = tvm.tir.indexmod
     idxdiv = tvm.tir.indexdiv
 
     conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
                       te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
-                                      oh * stride_h + kh * dilation_h,
+                                      oh*HSTR+kh*dilation_h,
                                       idxmod(ic, ic_bn),
-                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
+                                      ow*WSTR+kw*dilation_w].astype(out_dtype) *
                              kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
                                         kh, kw, idxmod(ic, ic_bn),
                                         oc_block].astype(out_dtype),
@@ -209,10 +175,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     unpack = te.compute(unpack_shape,
                         lambda n, c, h, w:
-                        conv[idxdiv(c, kernels_per_group), n,
-                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
+                        conv[idxdiv(c, KPG), n,
+                             idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)),
                              h, w,
-                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
+                             idxmod(idxmod(c, oc_bn), KPG)]
                         .astype(out_dtype),
                         name='output_unpack',
                         tag='group_conv2d_nchw')
@@ -252,9 +218,12 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
+            _, c, h, w = get_const_tuple(data.shape)
+            _, x, kh, kw = get_const_tuple(kernel.shape)
+
             args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
                     output, outs[0]]
-            _schedule_gspc_nchw(*args)
+            schedule_conv_sp_grouped(*args)
 
         scheduled_ops.append(op)
 
@@ -262,25 +231,25 @@ def traverse(op):
     return s
 
 
-def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
-                        conv_out, output, last):
-    """Schedule GSPC"""
+def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec,
+                             conv_out, output, last,
+                             **kwargs):
+    # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    hpad, wpad = padding
-    DOPAD = (hpad != 0 or wpad != 0)
+    HPAD, WPAD = padding
+    DOPAD = (HPAD != 0 or WPAD != 0)
 
-    _, W = data, kernel_vec
+    A, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
     if DOPAD:
         s[A0].compute_inline()
-
-    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
+    groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
     s[A1].parallel(parallel_axis)
@@ -322,7 +291,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
                       ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
-
     s[CC].parallel(parallel_axis)
 
     s[CC].vectorize(oc_block)

From ca5c67f02aa90f276317e8a80187547ac11a1b1b Mon Sep 17 00:00:00 2001
From: Perry Gibson <Perry.Gibson@glasgow.ac.uk>
Date: Thu, 27 Aug 2020 17:01:36 +0100
Subject: [PATCH 03/28] Added ASF license header

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 17 +++++++++++++++++
 python/tvm/topi/x86/group_conv2d.py     | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index c8ae9bc46fbe..d2f93b381f77 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import tvm
 from tvm import autotvm
 from tvm import te
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 310257099e8c..2afcc1b1e200 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import tvm
 from tvm import autotvm
 from tvm import te

From 737d36f05cf0947d09188065b474edbef3a5dd79 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Thu, 27 Aug 2020 17:44:40 +0100
Subject: [PATCH 04/28] Minor bug fixes

---
 python/tvm/topi/x86/group_conv2d.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 2afcc1b1e200..53d1c13a3e39 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -18,6 +18,8 @@
 import tvm
 from tvm import autotvm
 from tvm import te
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 from .util import get_fp32_len
 from ..util import get_const_tuple
 from ..nn.pad import pad
@@ -27,8 +29,6 @@
 from ..nn.util import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
 
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
                       out_dtype):
@@ -68,18 +68,22 @@ def _fallback_schedule(cfg, wkl):
     G = wkl.groups
     KPG = wkl.out_filter // G
     CPG = wkl.in_filter // G
-    oc_bn = 1
 
+    oc_bn = 1
     for bn in range(simd_width, 0, -1):
         if KPG % bn == 0:
             oc_bn = bn
             break
+    if oc_bn > KPG:
+        oc_bn = KPG
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
         if CPG % bn == 0:
             ic_bn = bn
             break
+    if ic_bn > CPG:
+        ic_bn = CPG
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -139,7 +143,7 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
         _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype),
-                            te.placeholder((N, CI // G, KH, KW),
+                            te.placeholder((CO, CI // G, KH, KW),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
 

From e22adbe921c71b8632490d4e4aba5aabdb390663 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Fri, 28 Aug 2020 13:19:23 +0100
Subject: [PATCH 05/28] Added asymmetric padding support Fixed linting

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 109 +++++++++++----------
 python/tvm/topi/x86/group_conv2d.py     | 121 ++++++++++++------------
 2 files changed, 122 insertions(+), 108 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index d2f93b381f77..6066c7060ca4 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
 
 import tvm
 from tvm import autotvm
@@ -55,30 +56,34 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
     data = te.placeholder(static_data_shape, dtype=data.dtype)
 
     wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype,
-                               layout)
+                               layout, asymmetric_pad=True)
     _fallback_schedule(cfg, wkl)
 
 
 def _fallback_schedule(cfg, wkl):
     simd_width = 4 # assume ARM SIMD Width is 4
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-    G = wkl.groups
-    KPG = wkl.out_filter // G
-    CPG = wkl.in_filter // G
-    oc_bn = 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    stride_w = wkl.wstride
+    out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1
+    groups = wkl.groups
+    kernels_per_group = wkl.out_filter // groups
+    kernel_depth = wkl.in_filter // groups
 
+    oc_bn = 1
     for bn in range(simd_width, 0, -1):
-        if KPG % bn == 0:
+        if kernels_per_group % bn == 0:
             oc_bn = bn
             break
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if CPG % bn == 0:
+        if kernel_depth % bn == 0:
             ic_bn = bn
             break
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -103,42 +108,45 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        HPAD, WPAD = padding, padding
+        pt, pl, pb, pr = padding, padding, padding, padding
     elif len(padding) == 2:
         HPAD, WPAD = padding
+        pt, pb = HPAD, HPAD
+        pl, pr = WPAD, WPAD
     else:
-        HPAD, _, WPAD, _ = padding
+        pt, pl, pb, pr = padding
+
+    HPAD = pt + pb
+    WPAD = pl + pr
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        HSTR, WSTR = strides, strides
+        stride_h, stride_w = strides, strides
     else:
-        HSTR, WSTR = strides
+        stride_h, stride_w = strides
 
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = IH + 2 * HPAD
-    pad_width = IW + 2 * WPAD
+    pad_height = in_height + pt + pb
+    pad_width = in_width + pl + pr
 
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
-    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
+    dilated_kernel_h = (k_height - 1) * dilation_h + 1
+    dilated_kernel_w = (k_width - 1) * dilation_w + 1
+    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
 
-    G = groups
-    KPG = CO // G
-    CPG = CI // G
+    kernels_per_group = out_channel // groups
 
-    cfg.define_split("tile_ic", CI, num_outputs=2)
-    cfg.define_split("tile_oc", CO, num_outputs=2)
-    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype),
-                            te.placeholder((N, CI // G, KH, KW),
+        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
+                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
 
@@ -147,43 +155,46 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     # pack data
     DOPAD = (HPAD != 0 or WPAD != 0)
     if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
+        data_pad = pad(data,
+                       (0, 0, pt, pl),
+                       (0, 0, pb, pr),
+                       name="data_pad")
     else:
         data_pad = data
 
-    shape = (G, N, CPG // ic_bn,
+    shape = (groups, batch_size, kernel_depth // ic_bn,
              pad_height, ic_bn, pad_width)
 
     data_vec = te.compute(shape,
                           lambda g, n, C, h, c, w:
-                          data_pad[n, C * ic_bn + c + CPG * g, h, w],
+                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
                           name='data_vec')
 
     # pack kernel
-    shape = (G, KPG//oc_bn, CPG//ic_bn,
-             KH, KW, ic_bn, oc_bn)
+    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
+             k_height, k_width, ic_bn, oc_bn)
     kernel_vec = te.compute(shape,
-                            lambda g, CO, CI, h, w, ci, co:
-                            kernel[(CO * oc_bn + co + g * KPG),
-                                   CI * ic_bn + ci, h, w],
+                            lambda g, out_channel, in_channel, h, w, ci, co:
+                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
+                                   in_channel * ic_bn + ci, h, w],
                             name='kernel_vec')
 
     # convolution
-    oshape = (G, N, KPG//oc_bn,
-              OH, OW, oc_bn)
-    unpack_shape = (N, CO, OH, OW)
+    oshape = (groups, batch_size, kernels_per_group//oc_bn,
+              out_height, out_width, oc_bn)
+    unpack_shape = (batch_size, out_channel, out_height, out_width)
 
-    ic = te.reduce_axis((0, (CPG)), name='ic')
-    kh = te.reduce_axis((0, KH), name='kh')
-    kw = te.reduce_axis((0, KW), name='kw')
+    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
+    kh = te.reduce_axis((0, k_height), name='kh')
+    kw = te.reduce_axis((0, k_width), name='kw')
     idxmod = tvm.tir.indexmod
     idxdiv = tvm.tir.indexdiv
 
     conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
                       te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
-                                      oh*HSTR+kh*dilation_h,
+                                      oh * stride_h + kh * dilation_h,
                                       idxmod(ic, ic_bn),
-                                      ow*WSTR+kw*dilation_w].astype(out_dtype) *
+                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
                              kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
                                         kh, kw, idxmod(ic, ic_bn),
                                         oc_block].astype(out_dtype),
@@ -191,10 +202,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     unpack = te.compute(unpack_shape,
                         lambda n, c, h, w:
-                        conv[idxdiv(c, KPG), n,
-                             idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)),
+                        conv[idxdiv(c, kernels_per_group), n,
+                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
                              h, w,
-                             idxmod(idxmod(c, oc_bn), KPG)]
+                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
                         .astype(out_dtype),
                         name='output_unpack',
                         tag='group_conv2d_nchw')
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 53d1c13a3e39..e56d2db912f8 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
 
 import tvm
 from tvm import autotvm
@@ -56,34 +57,34 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
     data = te.placeholder(static_data_shape, dtype=data.dtype)
 
     wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype,
-                               layout)
+                               layout, asymmetric_pad=True)
     _fallback_schedule(cfg, wkl)
 
 
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
-    G = wkl.groups
-    KPG = wkl.out_filter // G
-    CPG = wkl.in_filter // G
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    stride_w = wkl.wstride
+    out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1
+    groups = wkl.groups
+    kernels_per_group = wkl.out_filter // groups
+    kernel_depth = wkl.in_filter // groups
 
     oc_bn = 1
     for bn in range(simd_width, 0, -1):
-        if KPG % bn == 0:
+        if kernels_per_group % bn == 0:
             oc_bn = bn
             break
-    if oc_bn > KPG:
-        oc_bn = KPG
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if CPG % bn == 0:
+        if kernel_depth % bn == 0:
             ic_bn = bn
             break
-    if ic_bn > CPG:
-        ic_bn = CPG
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -108,42 +109,45 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        HPAD, WPAD = padding, padding
+        pt, pl, pb, pr = padding, padding, padding, padding
     elif len(padding) == 2:
         HPAD, WPAD = padding
+        pt, pb = HPAD, HPAD
+        pl, pr = WPAD, WPAD
     else:
-        HPAD, _, WPAD, _ = padding
+        pt, pl, pb, pr = padding
+
+    HPAD = pt + pb
+    WPAD = pl + pr
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        HSTR, WSTR = strides, strides
+        stride_h, stride_w = strides, strides
     else:
-        HSTR, WSTR = strides
+        stride_h, stride_w = strides
 
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = IH + 2 * HPAD
-    pad_width = IW + 2 * WPAD
+    pad_height = in_height + pt + pb
+    pad_width = in_width + pl + pr
 
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
-    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
+    dilated_kernel_h = (k_height - 1) * dilation_h + 1
+    dilated_kernel_w = (k_width - 1) * dilation_w + 1
+    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
 
-    G = groups
-    KPG = CO // G
-    CPG = CI // G
+    kernels_per_group = out_channel // groups
 
-    cfg.define_split("tile_ic", CI, num_outputs=2)
-    cfg.define_split("tile_oc", CO, num_outputs=2)
-    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((N, CI, IH, IW), dtype=data.dtype),
-                            te.placeholder((CO, CI // G, KH, KW),
+        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
+                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
 
@@ -152,43 +156,46 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     # pack data
     DOPAD = (HPAD != 0 or WPAD != 0)
     if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
+        data_pad = pad(data,
+                       (0, 0, pt, pl),
+                       (0, 0, pb, pr),
+                       name="data_pad")
     else:
         data_pad = data
 
-    shape = (G, N, CPG // ic_bn,
+    shape = (groups, batch_size, kernel_depth // ic_bn,
              pad_height, ic_bn, pad_width)
 
     data_vec = te.compute(shape,
                           lambda g, n, C, h, c, w:
-                          data_pad[n, C * ic_bn + c + CPG * g, h, w],
+                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
                           name='data_vec')
 
     # pack kernel
-    shape = (G, KPG//oc_bn, CPG//ic_bn,
-             KH, KW, ic_bn, oc_bn)
+    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
+             k_height, k_width, ic_bn, oc_bn)
     kernel_vec = te.compute(shape,
-                            lambda g, CO, CI, h, w, ci, co:
-                            kernel[(CO * oc_bn + co + g * KPG),
-                                   CI * ic_bn + ci, h, w],
+                            lambda g, out_channel, in_channel, h, w, ci, co:
+                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
+                                   in_channel * ic_bn + ci, h, w],
                             name='kernel_vec')
 
     # convolution
-    oshape = (G, N, KPG//oc_bn,
-              OH, OW, oc_bn)
-    unpack_shape = (N, CO, OH, OW)
+    oshape = (groups, batch_size, kernels_per_group//oc_bn,
+              out_height, out_width, oc_bn)
+    unpack_shape = (batch_size, out_channel, out_height, out_width)
 
-    ic = te.reduce_axis((0, (CPG)), name='ic')
-    kh = te.reduce_axis((0, KH), name='kh')
-    kw = te.reduce_axis((0, KW), name='kw')
+    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
+    kh = te.reduce_axis((0, k_height), name='kh')
+    kw = te.reduce_axis((0, k_width), name='kw')
     idxmod = tvm.tir.indexmod
     idxdiv = tvm.tir.indexdiv
 
     conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
                       te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
-                                      oh*HSTR+kh*dilation_h,
+                                      oh * stride_h + kh * dilation_h,
                                       idxmod(ic, ic_bn),
-                                      ow*WSTR+kw*dilation_w].astype(out_dtype) *
+                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
                              kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
                                         kh, kw, idxmod(ic, ic_bn),
                                         oc_block].astype(out_dtype),
@@ -196,10 +203,10 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     unpack = te.compute(unpack_shape,
                         lambda n, c, h, w:
-                        conv[idxdiv(c, KPG), n,
-                             idxmod(idxdiv(c, oc_bn), (KPG // oc_bn)),
+                        conv[idxdiv(c, kernels_per_group), n,
+                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
                              h, w,
-                             idxmod(idxmod(c, oc_bn), KPG)]
+                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
                         .astype(out_dtype),
                         name='output_unpack',
                         tag='group_conv2d_nchw')
@@ -239,9 +246,6 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            _, c, h, w = get_const_tuple(data.shape)
-            _, x, kh, kw = get_const_tuple(kernel.shape)
-
             args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
                     output, outs[0]]
             schedule_conv_sp_grouped(*args)
@@ -253,8 +257,7 @@ def traverse(op):
 
 
 def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec,
-                             conv_out, output, last,
-                             **kwargs):
+                             conv_out, output, last):
     # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
@@ -264,13 +267,13 @@ def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec,
     HPAD, WPAD = padding
     DOPAD = (HPAD != 0 or WPAD != 0)
 
-    A, W = data, kernel_vec
+    _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
     if DOPAD:
         s[A0].compute_inline()
-    groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
+    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
     s[A1].parallel(parallel_axis)

From d7f6c2b06a6fe9523654bb02a21f550a4b237dc7 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Fri, 28 Aug 2020 13:44:47 +0100
Subject: [PATCH 06/28] Improve linting

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 64 +++++++++++++------------
 python/tvm/topi/x86/group_conv2d.py     | 52 ++++++++++----------
 2 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 6066c7060ca4..4f7c29005804 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -19,15 +19,16 @@
 import tvm
 from tvm import autotvm
 from tvm import te
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 from ..util import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.conv2d import group_conv2d_nchw
 from ..nn.util import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 
 
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
@@ -62,9 +63,9 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
 
 def _fallback_schedule(cfg, wkl):
     simd_width = 4 # assume ARM SIMD Width is 4
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr
     stride_w = wkl.wstride
-    out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1
+    out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
     groups = wkl.groups
     kernels_per_group = wkl.out_filter // groups
     kernel_depth = wkl.in_filter // groups
@@ -100,6 +101,10 @@ def _fallback_schedule(cfg, wkl):
 @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu")
 def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
                                    dilation, groups, out_dtype='float32'):
+    """
+    Compute group conv2d with NCHW layout, using GSPC algorithm.
+    https://arxiv.org/abs/2006.09791
+    """
     assert isinstance(dilation, int) or len(dilation) == 2
     if isinstance(dilation, int):
         dilation_h, dilation_w = dilation, dilation
@@ -108,16 +113,16 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pt, pl, pb, pr = padding, padding, padding, padding
+        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
     elif len(padding) == 2:
-        HPAD, WPAD = padding
-        pt, pb = HPAD, HPAD
-        pl, pr = WPAD, WPAD
+        hpad, wpad = padding
+        pad_top, pad_bottom = hpad, hpad
+        pad_left, pad_right = wpad, wpad
     else:
-        pt, pl, pb, pr = padding
+        pad_top, pad_left, pad_bottom, pad_right = padding
 
-    HPAD = pt + pb
-    WPAD = pl + pr
+    hpad = pad_top + pad_bottom
+    wpad = pad_left + pad_right
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
@@ -128,13 +133,13 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
     out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pt + pb
-    pad_width = in_width + pl + pr
+    pad_height = in_height + pad_top + pad_bottom
+    pad_width = in_width + pad_left + pad_right
 
     dilated_kernel_h = (k_height - 1) * dilation_h + 1
     dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
+    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
 
     kernels_per_group = out_channel // groups
 
@@ -153,11 +158,11 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     oc_bn = cfg['tile_oc'].size[-1]
     ic_bn = cfg['tile_ic'].size[-1]
     # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
+    do_pad = (hpad != 0 or wpad != 0)
+    if do_pad:
         data_pad = pad(data,
-                       (0, 0, pt, pl),
-                       (0, 0, pb, pr),
+                       (0, 0, pad_top, pad_left),
+                       (0, 0, pad_bottom, pad_right),
                        name="data_pad")
     else:
         data_pad = data
@@ -245,12 +250,9 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            _, c, h, w = get_const_tuple(data.shape)
-            _, x, kh, kw = get_const_tuple(kernel.shape)
-
             args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
                     output, outs[0]]
-            schedule_conv_sp_grouped(*args)
+            _schedule_gspc_nchw(*args)
 
         scheduled_ops.append(op)
 
@@ -258,25 +260,25 @@ def traverse(op):
     return s
 
 
-def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec,
-                             conv_out, output, last,
-                             **kwargs):
+def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
+                        conv_out, output, last):
+    """Schedule GSPC"""
     # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    HPAD, WPAD = padding
-    DOPAD = (HPAD != 0 or WPAD != 0)
+    hpad, wpad = padding
+    do_pad = (hpad != 0 or wpad != 0)
 
-    A, W = data, kernel_vec
+    _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if DOPAD:
+    if do_pad:
         s[A0].compute_inline()
-    groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
+    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
     s[A1].parallel(parallel_axis)
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index e56d2db912f8..0f8426296cc9 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -26,7 +26,6 @@
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.conv2d import group_conv2d_nchw
 from ..nn.util import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
@@ -63,9 +62,9 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
 
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
-    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr
     stride_w = wkl.wstride
-    out_width = (wkl.width + pl + pr - wkl.wkernel) // stride_w + 1
+    out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
     groups = wkl.groups
     kernels_per_group = wkl.out_filter // groups
     kernel_depth = wkl.in_filter // groups
@@ -101,6 +100,10 @@ def _fallback_schedule(cfg, wkl):
 @autotvm.register_topi_compute("group_conv2d_nchw.x86")
 def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
                                    dilation, groups, out_dtype='float32'):
+    """
+    Compute group conv2d with NCHW layout, using GSPC algorithm.
+    https://arxiv.org/abs/2006.09791
+    """
     assert isinstance(dilation, int) or len(dilation) == 2
     if isinstance(dilation, int):
         dilation_h, dilation_w = dilation, dilation
@@ -109,16 +112,16 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pt, pl, pb, pr = padding, padding, padding, padding
+        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
     elif len(padding) == 2:
-        HPAD, WPAD = padding
-        pt, pb = HPAD, HPAD
-        pl, pr = WPAD, WPAD
+        hpad, wpad = padding
+        pad_top, pad_bottom = hpad, hpad
+        pad_left, pad_right = wpad, wpad
     else:
-        pt, pl, pb, pr = padding
+        pad_top, pad_left, pad_bottom, pad_right = padding
 
-    HPAD = pt + pb
-    WPAD = pl + pr
+    hpad = pad_top + pad_bottom
+    wpad = pad_left + pad_right
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
@@ -129,13 +132,13 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
     out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pt + pb
-    pad_width = in_width + pl + pr
+    pad_height = in_height + pad_top + pad_bottom
+    pad_width = in_width + pad_left + pad_right
 
     dilated_kernel_h = (k_height - 1) * dilation_h + 1
     dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
+    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
 
     kernels_per_group = out_channel // groups
 
@@ -154,11 +157,11 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     oc_bn = cfg['tile_oc'].size[-1]
     ic_bn = cfg['tile_ic'].size[-1]
     # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
+    do_pad = (hpad != 0 or wpad != 0)
+    if do_pad:
         data_pad = pad(data,
-                       (0, 0, pt, pl),
-                       (0, 0, pb, pr),
+                       (0, 0, pad_top, pad_left),
+                       (0, 0, pad_bottom, pad_right),
                        name="data_pad")
     else:
         data_pad = data
@@ -248,7 +251,7 @@ def traverse(op):
 
             args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
                     output, outs[0]]
-            schedule_conv_sp_grouped(*args)
+            _schedule_gspc_nchw(*args)
 
         scheduled_ops.append(op)
 
@@ -256,22 +259,23 @@ def traverse(op):
     return s
 
 
-def schedule_conv_sp_grouped(s, cfg, data, data_pad, data_vec, kernel_vec,
-                             conv_out, output, last):
+def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
+                  conv_out, output, last):
+    """Schedule GSPC"""
     # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    HPAD, WPAD = padding
-    DOPAD = (HPAD != 0 or WPAD != 0)
+    hpad, wpad = padding
+    do_pad = (hpad != 0 or wpad != 0)
 
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if DOPAD:
+    if do_pad:
         s[A0].compute_inline()
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 

From 323f8e5e80f63322a02f5d760a046a8eefe6b092 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Mon, 31 Aug 2020 09:51:13 +0100
Subject: [PATCH 07/28] Better linting, disable final linting checks

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 16 ++++++++-------
 python/tvm/topi/x86/group_conv2d.py     | 26 ++++++++++++++++---------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 4f7c29005804..db56ae0cdab9 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+# pylint: disable=no-value-for-parameter,import-outside-toplevel
 """Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
 
 import tvm
@@ -43,8 +45,8 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
-                        layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups,
+                        out_dtype, layout='NCHW'):
     """
     Get default schedule config for the workload
     """
@@ -63,7 +65,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
 
 def _fallback_schedule(cfg, wkl):
     simd_width = 4 # assume ARM SIMD Width is 4
-    pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    pad_left, pad_right = wkl.padl, wkl.padr
     stride_w = wkl.wstride
     out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
     groups = wkl.groups
@@ -158,8 +160,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     oc_bn = cfg['tile_oc'].size[-1]
     ic_bn = cfg['tile_ic'].size[-1]
     # pack data
-    do_pad = (hpad != 0 or wpad != 0)
-    if do_pad:
+    DOPAD = (hpad != 0 or wpad != 0)
+    if DOPAD:
         data_pad = pad(data,
                        (0, 0, pad_top, pad_left),
                        (0, 0, pad_bottom, pad_right),
@@ -270,13 +272,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
     hpad, wpad = padding
-    do_pad = (hpad != 0 or wpad != 0)
+    DOPAD = (hpad != 0 or wpad != 0)
 
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if do_pad:
+    if DOPAD:
         s[A0].compute_inline()
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 0f8426296cc9..377e480482fd 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+# pylint: disable=no-value-for-parameter,import-outside-toplevel
 """Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
 
 import tvm
@@ -42,8 +44,8 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
-                        layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups,
+                        out_dtype, layout='NCHW'):
     """
     Get default schedule config for the workload
     """
@@ -62,7 +64,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
 
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
-    pad_top, pad_left, pad_bottom, pad_right = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    pad_left, pad_right = wkl.padl, wkl.padr
     stride_w = wkl.wstride
     out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
     groups = wkl.groups
@@ -149,7 +151,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
+        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
+                                                dtype=data.dtype),
                             te.placeholder((out_channel, in_channel // groups, k_height, k_width),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
@@ -157,8 +160,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
     oc_bn = cfg['tile_oc'].size[-1]
     ic_bn = cfg['tile_ic'].size[-1]
     # pack data
-    do_pad = (hpad != 0 or wpad != 0)
-    if do_pad:
+    DOPAD = (hpad != 0 or wpad != 0)
+    if DOPAD:
         data_pad = pad(data,
                        (0, 0, pad_top, pad_left),
                        (0, 0, pad_bottom, pad_right),
@@ -260,7 +263,7 @@ def traverse(op):
 
 
 def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
-                  conv_out, output, last):
+                        conv_out, output, last):
     """Schedule GSPC"""
     # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
@@ -269,14 +272,16 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
     hpad, wpad = padding
-    do_pad = (hpad != 0 or wpad != 0)
+    DOPAD = (hpad != 0 or wpad != 0)
 
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if do_pad:
+    if DOPAD:
         s[A0].compute_inline()
+        # s[A0].compute_at(s[A1])
+
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
@@ -319,8 +324,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
                       ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
+    # s[A1].compute_at(CC, parallel_axis)
     s[CC].parallel(parallel_axis)
 
+
+
     s[CC].vectorize(oc_block)
 
     s[CC].unroll(ow_block)

From 55ad3d72b01135b673b8f808540c551dea0bece9 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Thu, 3 Sep 2020 19:56:36 +0100
Subject: [PATCH 08/28] Fixed final linting errors (figured out how to run lint
 tests locally)

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 4 ++--
 python/tvm/topi/x86/group_conv2d.py     | 6 +-----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index db56ae0cdab9..f62d330cab30 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -152,7 +152,8 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
+        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
+                                                dtype=data.dtype),
                             te.placeholder((out_channel, in_channel // groups, k_height, k_width),
                                            dtype=kernel.dtype),
                             strides, padding, groups, out_dtype)
@@ -265,7 +266,6 @@ def traverse(op):
 def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
                         conv_out, output, last):
     """Schedule GSPC"""
-    # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 377e480482fd..ffef5099b304 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -265,7 +265,6 @@ def traverse(op):
 def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
                         conv_out, output, last):
     """Schedule GSPC"""
-    # fetch schedule
     ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
                                       cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
@@ -280,7 +279,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
     # schedule data
     if DOPAD:
         s[A0].compute_inline()
-        # s[A0].compute_at(s[A1])
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
@@ -324,10 +322,8 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
                       ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
-    # s[A1].compute_at(CC, parallel_axis)
-    s[CC].parallel(parallel_axis)
-
 
+    s[CC].parallel(parallel_axis)
 
     s[CC].vectorize(oc_block)
 

From 0722844a6b53d35aa7db13923116edd090566de5 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Mon, 21 Dec 2020 19:31:13 +0000
Subject: [PATCH 09/28] fixing linter formatting part 1

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 179 ++++++++++++++----------
 python/tvm/topi/x86/group_conv2d.py     | 172 +++++++++++++----------
 2 files changed, 205 insertions(+), 146 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index f62d330cab30..7de0dfa300b8 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -32,12 +32,11 @@
 
 
 
-
-def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
-                      out_dtype):
+def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
     """Compute group_conv2d with NCHW layout"""
-    return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding,
-                                          dilation, groups, out_dtype)
+    return group_conv2d_nchw_spatial_pack(
+        data, kernel, strides, padding, dilation, groups, out_dtype
+    )
 
 
 def schedule_group_conv2d_nchw(outs):
@@ -45,8 +44,7 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups,
-                        out_dtype, layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'):
     """
     Get default schedule config for the workload
     """
@@ -58,13 +56,12 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups,
             static_data_shape.append(dim)
     data = te.placeholder(static_data_shape, dtype=data.dtype)
 
-    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype,
-                               layout, asymmetric_pad=True)
+    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout)
     _fallback_schedule(cfg, wkl)
 
 
 def _fallback_schedule(cfg, wkl):
-    simd_width = 4 # assume ARM SIMD Width is 4
+    simd_width = 4  # assume ARM SIMD Width is 4
     pad_left, pad_right = wkl.padl, wkl.padr
     stride_w = wkl.wstride
     out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
@@ -101,8 +98,9 @@ def _fallback_schedule(cfg, wkl):
 
 
 @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu")
-def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
-                                   dilation, groups, out_dtype='float32'):
+def group_conv2d_nchw_spatial_pack(
+        cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'
+):
     """
     Compute group conv2d with NCHW layout, using GSPC algorithm.
     https://arxiv.org/abs/2006.09791
@@ -152,71 +150,101 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
-                                                dtype=data.dtype),
-                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
-                                           dtype=kernel.dtype),
-                            strides, padding, groups, out_dtype)
-
-    oc_bn = cfg['tile_oc'].size[-1]
-    ic_bn = cfg['tile_ic'].size[-1]
+        _get_default_config(
+            cfg,
+            te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
+            te.placeholder(
+                (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype
+            ),
+            strides,
+            padding,
+            groups,
+            out_dtype,
+        )
+
+    oc_bn = cfg["tile_oc"].size[-1]
+    ic_bn = cfg["tile_ic"].size[-1]
+
     # pack data
-    DOPAD = (hpad != 0 or wpad != 0)
+    DOPAD = hpad != 0 or wpad != 0
     if DOPAD:
-        data_pad = pad(data,
-                       (0, 0, pad_top, pad_left),
-                       (0, 0, pad_bottom, pad_right),
-                       name="data_pad")
+        data_pad = pad(
+            data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad"
+        )
     else:
         data_pad = data
 
-    shape = (groups, batch_size, kernel_depth // ic_bn,
-             pad_height, ic_bn, pad_width)
+    shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width)
 
-    data_vec = te.compute(shape,
-                          lambda g, n, C, h, c, w:
-                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
-                          name='data_vec')
+    data_vec = te.compute(
+        shape,
+        lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
+        name="data_vec",
+    )
 
     # pack kernel
-    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
-             k_height, k_width, ic_bn, oc_bn)
-    kernel_vec = te.compute(shape,
-                            lambda g, out_channel, in_channel, h, w, ci, co:
-                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
-                                   in_channel * ic_bn + ci, h, w],
-                            name='kernel_vec')
+    shape = (
+        groups,
+        kernels_per_group // oc_bn,
+        kernel_depth // ic_bn,
+        k_height,
+        k_width,
+        ic_bn,
+        oc_bn,
+    )
+
+    kernel_vec = te.compute(
+        shape,
+        lambda g, out_channel, in_channel, h, w, ci, co: kernel[
+            (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w
+        ],
+        name="kernel_vec",
+    )
 
     # convolution
-    oshape = (groups, batch_size, kernels_per_group//oc_bn,
-              out_height, out_width, oc_bn)
+    oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn)
     unpack_shape = (batch_size, out_channel, out_height, out_width)
 
-    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
-    kh = te.reduce_axis((0, k_height), name='kh')
-    kw = te.reduce_axis((0, k_width), name='kw')
+    ic = te.reduce_axis((0, (kernel_depth)), name="ic")
+    kh = te.reduce_axis((0, k_height), name="kh")
+    kw = te.reduce_axis((0, k_width), name="kw")
+
     idxmod = tvm.tir.indexmod
     idxdiv = tvm.tir.indexdiv
 
-    conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
-                      te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
-                                      oh * stride_h + kh * dilation_h,
-                                      idxmod(ic, ic_bn),
-                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
-                             kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
-                                        kh, kw, idxmod(ic, ic_bn),
-                                        oc_block].astype(out_dtype),
-                             axis=[ic, kh, kw]), name='conv')
-
-    unpack = te.compute(unpack_shape,
-                        lambda n, c, h, w:
-                        conv[idxdiv(c, kernels_per_group), n,
-                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
-                             h, w,
-                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
-                        .astype(out_dtype),
-                        name='output_unpack',
-                        tag='group_conv2d_nchw')
+    conv = te.compute(
+        oshape,
+        lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(
+            data_vec[
+                g,
+                n,
+                idxdiv(ic, ic_bn),
+                oh * stride_h + kh * dilation_h,
+                idxmod(ic, ic_bn),
+                ow * stride_w + kw * dilation_w,
+            ].astype(out_dtype)
+            * kernel_vec[
+                g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block
+            ].astype(out_dtype),
+            axis=[ic, kh, kw],
+        ),
+        name="conv",
+    )
+
+    unpack = te.compute(
+        unpack_shape,
+        lambda n, c, h, w: conv[
+            idxdiv(c, kernels_per_group),
+            n,
+            idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
+            h,
+            w,
+            idxmod(idxmod(c, oc_bn), kernels_per_group),
+        ].astype(out_dtype),
+        name="output_unpack",
+        tag="group_conv2d_nchw",
+    )
+
     return unpack
 
 
@@ -236,7 +264,7 @@ def traverse(op):
                 if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
 
-        if 'group_conv2d_nchw' in op.tag:
+        if "group_conv2d_nchw" in op.tag:
             output = op.output(0)
 
             if "tile_ic" not in cfg:
@@ -253,8 +281,7 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
-                    output, outs[0]]
+            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
             _schedule_gspc_nchw(*args)
 
         scheduled_ops.append(op)
@@ -263,16 +290,20 @@ def traverse(op):
     return s
 
 
-def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
-                        conv_out, output, last):
+def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
     """Schedule GSPC"""
-    ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
-                                      cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
+    ic_bn, oc_bn, reg_n, unroll_kw = (
+        cfg["tile_ic"].size[-1],
+        cfg["tile_oc"].size[-1],
+        cfg["tile_ow"].size[-1],
+        cfg["unroll_kw"].val,
+    )
+
 
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
     hpad, wpad = padding
-    DOPAD = (hpad != 0 or wpad != 0)
+    DOPAD = hpad != 0 or wpad != 0
 
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
@@ -297,7 +328,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
 
     # schedule conv
     C, O0, O = conv_out, output, last
-    CC = s.cache_write(C, 'global')
+    CC = s.cache_write(C, "global")
 
     _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis
 
@@ -314,12 +345,10 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
     ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
     if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw,
-                      ow_block, oc_block)
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
         s[CC].unroll(kw)
     else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block,
-                      ow_block, oc_block)
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
     s[CC].parallel(parallel_axis)
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index ffef5099b304..c6b808ac9534 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -32,11 +32,11 @@
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
 
-def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups,
-                      out_dtype):
+def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
     """Compute group_conv2d with NCHW layout"""
-    return group_conv2d_nchw_spatial_pack(data, kernel, strides, padding,
-                                          dilation, groups, out_dtype)
+    return group_conv2d_nchw_spatial_pack(
+        data, kernel, strides, padding, dilation, groups, out_dtype
+    )
 
 
 def schedule_group_conv2d_nchw(outs):
@@ -44,8 +44,7 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups,
-                        out_dtype, layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'):
     """
     Get default schedule config for the workload
     """
@@ -57,8 +56,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups,
             static_data_shape.append(dim)
     data = te.placeholder(static_data_shape, dtype=data.dtype)
 
-    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype,
-                               layout, asymmetric_pad=True)
+    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout)
     _fallback_schedule(cfg, wkl)
 
 
@@ -100,8 +98,9 @@ def _fallback_schedule(cfg, wkl):
 
 
 @autotvm.register_topi_compute("group_conv2d_nchw.x86")
-def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
-                                   dilation, groups, out_dtype='float32'):
+def group_conv2d_nchw_spatial_pack(
+        cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'
+):
     """
     Compute group conv2d with NCHW layout, using GSPC algorithm.
     https://arxiv.org/abs/2006.09791
@@ -151,71 +150,100 @@ def group_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding,
 
     # If no config was set, we can fallback to default config.
     if cfg.is_fallback:
-        _get_default_config(cfg, te.placeholder((batch_size, in_channel, in_height, in_width),
-                                                dtype=data.dtype),
-                            te.placeholder((out_channel, in_channel // groups, k_height, k_width),
-                                           dtype=kernel.dtype),
-                            strides, padding, groups, out_dtype)
-
-    oc_bn = cfg['tile_oc'].size[-1]
-    ic_bn = cfg['tile_ic'].size[-1]
+        _get_default_config(
+            cfg,
+            te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype),
+            te.placeholder(
+                (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype
+            ),
+            strides,
+            padding,
+            groups,
+            out_dtype,
+        )
+
+    oc_bn = cfg["tile_oc"].size[-1]
+    ic_bn = cfg["tile_ic"].size[-1]
+
     # pack data
-    DOPAD = (hpad != 0 or wpad != 0)
+    DOPAD = hpad != 0 or wpad != 0
     if DOPAD:
-        data_pad = pad(data,
-                       (0, 0, pad_top, pad_left),
-                       (0, 0, pad_bottom, pad_right),
-                       name="data_pad")
+        data_pad = pad(
+            data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad"
+        )
     else:
         data_pad = data
 
-    shape = (groups, batch_size, kernel_depth // ic_bn,
-             pad_height, ic_bn, pad_width)
+    shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width)
 
-    data_vec = te.compute(shape,
-                          lambda g, n, C, h, c, w:
-                          data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
-                          name='data_vec')
+    data_vec = te.compute(
+        shape,
+        lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w],
+        name="data_vec",
+    )
 
     # pack kernel
-    shape = (groups, kernels_per_group//oc_bn, kernel_depth//ic_bn,
-             k_height, k_width, ic_bn, oc_bn)
-    kernel_vec = te.compute(shape,
-                            lambda g, out_channel, in_channel, h, w, ci, co:
-                            kernel[(out_channel * oc_bn + co + g * kernels_per_group),
-                                   in_channel * ic_bn + ci, h, w],
-                            name='kernel_vec')
+    shape = (
+        groups,
+        kernels_per_group // oc_bn,
+        kernel_depth // ic_bn,
+        k_height,
+        k_width,
+        ic_bn,
+        oc_bn,
+    )
+
+    kernel_vec = te.compute(
+        shape,
+        lambda g, out_channel, in_channel, h, w, ci, co: kernel[
+            (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w
+        ],
+        name="kernel_vec",
+    )
 
     # convolution
-    oshape = (groups, batch_size, kernels_per_group//oc_bn,
-              out_height, out_width, oc_bn)
+    oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn)
     unpack_shape = (batch_size, out_channel, out_height, out_width)
 
-    ic = te.reduce_axis((0, (kernel_depth)), name='ic')
-    kh = te.reduce_axis((0, k_height), name='kh')
-    kw = te.reduce_axis((0, k_width), name='kw')
+    ic = te.reduce_axis((0, (kernel_depth)), name="ic")
+    kh = te.reduce_axis((0, k_height), name="kh")
+    kw = te.reduce_axis((0, k_width), name="kw")
+
     idxmod = tvm.tir.indexmod
     idxdiv = tvm.tir.indexdiv
+    conv = te.compute(
+        oshape,
+        lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(
+            data_vec[
+                g,
+                n,
+                idxdiv(ic, ic_bn),
+                oh * stride_h + kh * dilation_h,
+                idxmod(ic, ic_bn),
+                ow * stride_w + kw * dilation_w,
+            ].astype(out_dtype)
+            * kernel_vec[
+                g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block
+            ].astype(out_dtype),
+            axis=[ic, kh, kw],
+        ),
+        name="conv",
+    )
+
+    unpack = te.compute(
+        unpack_shape,
+        lambda n, c, h, w: conv[
+            idxdiv(c, kernels_per_group),
+            n,
+            idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
+            h,
+            w,
+            idxmod(idxmod(c, oc_bn), kernels_per_group),
+        ].astype(out_dtype),
+        name="output_unpack",
+        tag="group_conv2d_nchw",
+    )
 
-    conv = te.compute(oshape, lambda g, n, oc_chunk, oh, ow, oc_block:
-                      te.sum(data_vec[g, n, idxdiv(ic, ic_bn),
-                                      oh * stride_h + kh * dilation_h,
-                                      idxmod(ic, ic_bn),
-                                      ow * stride_w + kw * dilation_w].astype(out_dtype) *
-                             kernel_vec[g, oc_chunk, idxdiv(ic, ic_bn),
-                                        kh, kw, idxmod(ic, ic_bn),
-                                        oc_block].astype(out_dtype),
-                             axis=[ic, kh, kw]), name='conv')
-
-    unpack = te.compute(unpack_shape,
-                        lambda n, c, h, w:
-                        conv[idxdiv(c, kernels_per_group), n,
-                             idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)),
-                             h, w,
-                             idxmod(idxmod(c, oc_bn), kernels_per_group)]
-                        .astype(out_dtype),
-                        name='output_unpack',
-                        tag='group_conv2d_nchw')
     return unpack
 
 
@@ -235,7 +263,7 @@ def traverse(op):
                 if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
 
-        if 'group_conv2d_nchw' in op.tag:
+        if "group_conv2d_nchw" in op.tag:
             output = op.output(0)
 
             if "tile_ic" not in cfg:
@@ -252,8 +280,7 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
-                    output, outs[0]]
+            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
             _schedule_gspc_nchw(*args)
 
         scheduled_ops.append(op)
@@ -265,13 +292,18 @@ def traverse(op):
 def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
                         conv_out, output, last):
     """Schedule GSPC"""
-    ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
-                                      cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
+    ic_bn, oc_bn, reg_n, unroll_kw = (
+        cfg["tile_ic"].size[-1],
+        cfg["tile_oc"].size[-1],
+        cfg["tile_ow"].size[-1],
+        cfg["unroll_kw"].val,
+    )
+
 
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
     hpad, wpad = padding
-    DOPAD = (hpad != 0 or wpad != 0)
+    DOPAD = hpad != 0 or wpad != 0
 
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
@@ -297,7 +329,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
 
     # schedule conv
     C, O0, O = conv_out, output, last
-    CC = s.cache_write(C, 'global')
+    CC = s.cache_write(C, "global")
 
     _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis
 
@@ -314,12 +346,10 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
     ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
     if unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw,
-                      ow_block, oc_block)
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
         s[CC].unroll(kw)
     else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block,
-                      ow_block, oc_block)
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
 

From c9460efe4a55d7d60555d5f613ce1a5d0797f32b Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Mon, 21 Dec 2020 19:40:24 +0000
Subject: [PATCH 10/28] fixing linter formatting part 2

---
 python/tvm/relay/op/strategy/arm_cpu.py | 2 +-
 python/tvm/topi/x86/group_conv2d.py     | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 36200bb024a9..005eae68b8b7 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -210,7 +210,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True),
                 wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.arm_cpu"
+                name="group_conv2d_nchw.arm_cpu",
             )
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index c6b808ac9534..790f15a2949d 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -44,7 +44,7 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"):
     """
     Get default schedule config for the workload
     """
@@ -99,7 +99,7 @@ def _fallback_schedule(cfg, wkl):
 
 @autotvm.register_topi_compute("group_conv2d_nchw.x86")
 def group_conv2d_nchw_spatial_pack(
-        cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'
+        cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
 ):
     """
     Compute group conv2d with NCHW layout, using GSPC algorithm.
@@ -202,7 +202,7 @@ def group_conv2d_nchw_spatial_pack(
     )
 
     # convolution
-    oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn)
+    oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn)
     unpack_shape = (batch_size, out_channel, out_height, out_width)
 
     ic = te.reduce_axis((0, (kernel_depth)), name="ic")
@@ -299,7 +299,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
         cfg["unroll_kw"].val,
     )
 
-
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
     hpad, wpad = padding

From b41a8687bc2e9c992afd52e4539d9c5635b7a169 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Mon, 21 Dec 2020 19:45:41 +0000
Subject: [PATCH 11/28] fixing linter formatting part 3

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 8 +++-----
 python/tvm/topi/x86/group_conv2d.py     | 5 ++---
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 7de0dfa300b8..a8fb4c6920a9 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -31,7 +31,6 @@
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
 
-
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
     """Compute group_conv2d with NCHW layout"""
     return group_conv2d_nchw_spatial_pack(
@@ -44,7 +43,7 @@ def schedule_group_conv2d_nchw(outs):
     return schedule_group_conv2d_nchwc(outs)
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout='NCHW'):
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"):
     """
     Get default schedule config for the workload
     """
@@ -99,7 +98,7 @@ def _fallback_schedule(cfg, wkl):
 
 @autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu")
 def group_conv2d_nchw_spatial_pack(
-        cfg, data, kernel, strides, padding, dilation, groups, out_dtype='float32'
+    cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
 ):
     """
     Compute group conv2d with NCHW layout, using GSPC algorithm.
@@ -202,7 +201,7 @@ def group_conv2d_nchw_spatial_pack(
     )
 
     # convolution
-    oshape = (groups, batch_size, kernels_per_group//oc_bn, out_height, out_width, oc_bn)
+    oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn)
     unpack_shape = (batch_size, out_channel, out_height, out_width)
 
     ic = te.reduce_axis((0, (kernel_depth)), name="ic")
@@ -299,7 +298,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
     hpad, wpad = padding
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 790f15a2949d..8d946ba0dd98 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -99,7 +99,7 @@ def _fallback_schedule(cfg, wkl):
 
 @autotvm.register_topi_compute("group_conv2d_nchw.x86")
 def group_conv2d_nchw_spatial_pack(
-        cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
+    cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
 ):
     """
     Compute group conv2d with NCHW layout, using GSPC algorithm.
@@ -289,8 +289,7 @@ def traverse(op):
     return s
 
 
-def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec,
-                        conv_out, output, last):
+def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
     """Schedule GSPC"""
     ic_bn, oc_bn, reg_n, unroll_kw = (
         cfg["tile_ic"].size[-1],

From 1adf616183cc2a3c333f99698b9d02b0aeaaf84e Mon Sep 17 00:00:00 2001
From: Wheest <Wheest@users.noreply.github.com>
Date: Mon, 21 Dec 2020 20:06:54 +0000
Subject: [PATCH 12/28] Update conv2d.py

Fixed merge issue
---
 python/tvm/topi/nn/conv2d.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 80f87f86736c..a00df1fdbf0f 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -174,10 +174,14 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou
     else:
         KH, KW, CIG, CO = get_const_tuple(kernel.shape)
 
+<<<<<<< HEAD
     pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
     dilation_h, dilation_w = (
         dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
     )
+=======
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
+>>>>>>> c57bd780a (Update conv2d.py)
     GRPS = CI // CIG
     if isinstance(stride, (tuple, list)):
         HSTR, WSTR = stride

From 0a26e5c39a76365279741411dbc355d206ce5feb Mon Sep 17 00:00:00 2001
From: "Perry Gibson (gabriel)" <p.gibson.2@research.gla.ac.uk>
Date: Wed, 6 Jan 2021 18:25:53 +0000
Subject: [PATCH 13/28] Rebase, and update responding to some comments

---
 python/tvm/relay/op/strategy/x86.py     | 10 +++++-----
 python/tvm/topi/arm_cpu/group_conv2d.py |  8 ++++----
 python/tvm/topi/nn/conv2d.py            |  4 ----
 python/tvm/topi/x86/group_conv2d.py     | 10 +++++-----
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 60bd92ef63d1..98eb28ee0ef3 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             assert kernel_layout == "HWIO"
             if not is_auto_scheduler_enabled():
                 logger.warning("group_conv2d is not optimized for x86 with autotvm.")
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                name="group_conv2d_nhwc.generic",
-            )
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
+                    wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
+                    name="group_conv2d_nhwc.generic",
+                )
         else:
             raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
     return strategy
diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index a8fb4c6920a9..99f4d5c946d6 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -23,11 +23,11 @@
 from tvm import te
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
-from ..util import get_const_tuple
+from ..utils import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.util import infer_pad
+from ..nn.utils import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
 
@@ -62,8 +62,8 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
 def _fallback_schedule(cfg, wkl):
     simd_width = 4  # assume ARM SIMD Width is 4
     pad_left, pad_right = wkl.padl, wkl.padr
-    stride_w = wkl.wstride
-    out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
+    stride_w = wkl.stride_w
+    out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1
     groups = wkl.groups
     kernels_per_group = wkl.out_filter // groups
     kernel_depth = wkl.in_filter // groups
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index a00df1fdbf0f..80f87f86736c 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -174,14 +174,10 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou
     else:
         KH, KW, CIG, CO = get_const_tuple(kernel.shape)
 
-<<<<<<< HEAD
     pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
     dilation_h, dilation_w = (
         dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
     )
-=======
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
->>>>>>> c57bd780a (Update conv2d.py)
     GRPS = CI // CIG
     if isinstance(stride, (tuple, list)):
         HSTR, WSTR = stride
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 8d946ba0dd98..02dc058b39cc 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -23,12 +23,12 @@
 from tvm import te
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
-from .util import get_fp32_len
-from ..util import get_const_tuple
+from .utils import get_fp32_len
+from ..utils import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.util import infer_pad
+from ..nn.utils import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
 
@@ -63,8 +63,8 @@ def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype,
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
     pad_left, pad_right = wkl.padl, wkl.padr
-    stride_w = wkl.wstride
-    out_width = (wkl.width + pad_left + pad_right - wkl.wkernel) // stride_w + 1
+    stride_w = wkl.stride_w
+    out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1
     groups = wkl.groups
     kernels_per_group = wkl.out_filter // groups
     kernel_depth = wkl.in_filter // groups

From c79a7cac301409ca63a56a6f1db4b8893c5f95d1 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Thu, 18 Mar 2021 15:37:15 +0000
Subject: [PATCH 14/28] Fixed AutoScheduler bug for NHWC case

---
 python/tvm/relay/op/strategy/x86.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 98eb28ee0ef3..60bd92ef63d1 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             assert kernel_layout == "HWIO"
             if not is_auto_scheduler_enabled():
                 logger.warning("group_conv2d is not optimized for x86 with autotvm.")
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                    wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                    name="group_conv2d_nhwc.generic",
-                )
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
+                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
+                name="group_conv2d_nhwc.generic",
+            )
         else:
             raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
     return strategy

From bd3237234d89726d80f5cede7c6ffbf7d6171ab9 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Fri, 19 Mar 2021 18:08:16 +0000
Subject: [PATCH 15/28] removed infer_pad from GSPC

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 9 ++-------
 python/tvm/topi/x86/group_conv2d.py     | 9 +--------
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 99f4d5c946d6..024b649d9b23 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -27,7 +27,6 @@
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.utils import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
 
@@ -298,17 +297,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-    # no stride and padding info here
-    padding = infer_pad(data, data_pad)
-    hpad, wpad = padding
-    DOPAD = hpad != 0 or wpad != 0
-
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if DOPAD:
+    if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
+
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 02dc058b39cc..160c6dfb9cae 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -28,7 +28,6 @@
 from ..nn.pad import pad
 from .. import tag
 
-from ..nn.utils import infer_pad
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
 
@@ -275,7 +274,6 @@ def traverse(op):
                 s[kernel].compute_inline()
             data_vec = conv_out.op.input_tensors[0]
             data = data_vec.op.input_tensors[0]
-            data_pad = None
             if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
@@ -298,16 +296,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-    # no stride and padding info here
-    padding = infer_pad(data, data_pad)
-    hpad, wpad = padding
-    DOPAD = hpad != 0 or wpad != 0
-
     _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if DOPAD:
+    if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis

From a3f9e510d3ef3b16524ce7a8647fd0debe4130f4 Mon Sep 17 00:00:00 2001
From: Wheest <perry@gibsonic.org>
Date: Sun, 26 Jul 2020 15:16:05 +0100
Subject: [PATCH 16/28] Rebase, and undoing accidental removal of auto
 scheduler NHWC support

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 76 +++++++---------------
 python/tvm/topi/x86/group_conv2d.py     | 84 +++++++++----------------
 2 files changed, 52 insertions(+), 108 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 024b649d9b23..11bff7ddd5fe 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -1,29 +1,7 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
-
 import tvm
 from tvm import autotvm
 from tvm import te
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from ..utils import get_const_tuple
+from ..util import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
@@ -68,20 +46,17 @@ def _fallback_schedule(cfg, wkl):
     kernel_depth = wkl.in_filter // groups
 
     oc_bn = 1
+
     for bn in range(simd_width, 0, -1):
-        if kernels_per_group % bn == 0:
+        if KPG % bn == 0:
             oc_bn = bn
             break
-    if oc_bn > kernels_per_group:
-        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if kernel_depth % bn == 0:
+        if CPG % bn == 0:
             ic_bn = bn
             break
-    if ic_bn > kernel_depth:
-        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -111,39 +86,36 @@ def group_conv2d_nchw_spatial_pack(
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
+        HPAD, WPAD = padding, padding
     elif len(padding) == 2:
-        hpad, wpad = padding
-        pad_top, pad_bottom = hpad, hpad
-        pad_left, pad_right = wpad, wpad
+        HPAD, WPAD = padding
     else:
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-    hpad = pad_top + pad_bottom
-    wpad = pad_left + pad_right
+        HPAD, _, WPAD, _ = padding
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        stride_h, stride_w = strides, strides
+        HSTR, WSTR = strides, strides
     else:
-        stride_h, stride_w = strides
+        HSTR, WSTR = strides
 
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pad_top + pad_bottom
-    pad_width = in_width + pad_left + pad_right
+    pad_height = IH + 2 * HPAD
+    pad_width = IW + 2 * WPAD
 
-    dilated_kernel_h = (k_height - 1) * dilation_h + 1
-    dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
+    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
 
-    kernels_per_group = out_channel // groups
+    G = groups
+    KPG = CO // G
+    CPG = CI // G
 
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", CI, num_outputs=2)
+    cfg.define_split("tile_oc", CO, num_outputs=2)
+    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
@@ -297,7 +269,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-    _, W = data, kernel_vec
+    A, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 160c6dfb9cae..c5a6475ec913 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -1,35 +1,15 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
-# pylint: disable=no-value-for-parameter,import-outside-toplevel
-"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
-
 import tvm
 from tvm import autotvm
 from tvm import te
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
-from .utils import get_fp32_len
-from ..utils import get_const_tuple
+from .util import get_fp32_len
+from ..util import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
     """Compute group_conv2d with NCHW layout"""
@@ -69,20 +49,17 @@ def _fallback_schedule(cfg, wkl):
     kernel_depth = wkl.in_filter // groups
 
     oc_bn = 1
+
     for bn in range(simd_width, 0, -1):
-        if kernels_per_group % bn == 0:
+        if KPG % bn == 0:
             oc_bn = bn
             break
-    if oc_bn > kernels_per_group:
-        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if kernel_depth % bn == 0:
+        if CPG % bn == 0:
             ic_bn = bn
             break
-    if ic_bn > kernel_depth:
-        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -112,39 +89,36 @@ def group_conv2d_nchw_spatial_pack(
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
+        HPAD, WPAD = padding, padding
     elif len(padding) == 2:
-        hpad, wpad = padding
-        pad_top, pad_bottom = hpad, hpad
-        pad_left, pad_right = wpad, wpad
+        HPAD, WPAD = padding
     else:
-        pad_top, pad_left, pad_bottom, pad_right = padding
-
-    hpad = pad_top + pad_bottom
-    wpad = pad_left + pad_right
+        HPAD, _, WPAD, _ = padding
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        stride_h, stride_w = strides, strides
+        HSTR, WSTR = strides, strides
     else:
-        stride_h, stride_w = strides
+        HSTR, WSTR = strides
 
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pad_top + pad_bottom
-    pad_width = in_width + pad_left + pad_right
+    pad_height = IH + 2 * HPAD
+    pad_width = IW + 2 * WPAD
 
-    dilated_kernel_h = (k_height - 1) * dilation_h + 1
-    dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
+    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
 
-    kernels_per_group = out_channel // groups
+    G = groups
+    KPG = CO // G
+    CPG = CI // G
 
-    cfg.define_split("tile_ic", in_channel, num_outputs=2)
-    cfg.define_split("tile_oc", out_channel, num_outputs=2)
-    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", CI, num_outputs=2)
+    cfg.define_split("tile_oc", CO, num_outputs=2)
+    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
@@ -296,14 +270,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-    _, W = data, kernel_vec
+    A, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
     if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
-
-    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
+    groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
     s[A1].parallel(parallel_axis)
@@ -343,7 +316,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
-
     s[CC].parallel(parallel_axis)
 
     s[CC].vectorize(oc_block)

From e741ca730f6678a163f5c3ba8ba87b238f6d36c4 Mon Sep 17 00:00:00 2001
From: Perry Gibson <Perry.Gibson@glasgow.ac.uk>
Date: Thu, 27 Aug 2020 17:01:36 +0100
Subject: [PATCH 17/28] Added ASF license header

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 17 +++++++++++++++++
 python/tvm/topi/x86/group_conv2d.py     | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 11bff7ddd5fe..de7e16f5cd16 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import tvm
 from tvm import autotvm
 from tvm import te
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index c5a6475ec913..d4bd62352a80 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 import tvm
 from tvm import autotvm
 from tvm import te

From 36f912856df0d1e0d5ef5c96a0b52e7630fa96af Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Thu, 27 Aug 2020 17:44:40 +0100
Subject: [PATCH 18/28] Minor bug fixes

---
 python/tvm/topi/x86/group_conv2d.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index d4bd62352a80..d0b25cc83e74 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -18,6 +18,8 @@
 import tvm
 from tvm import autotvm
 from tvm import te
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 from .util import get_fp32_len
 from ..util import get_const_tuple
 from ..nn.pad import pad
@@ -25,8 +27,6 @@
 
 from ..nn.conv2d import _get_workload as _get_conv2d_workload
 
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-
 
 def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
     """Compute group_conv2d with NCHW layout"""
@@ -67,16 +67,21 @@ def _fallback_schedule(cfg, wkl):
 
     oc_bn = 1
 
+    oc_bn = 1
     for bn in range(simd_width, 0, -1):
         if KPG % bn == 0:
             oc_bn = bn
             break
+    if oc_bn > KPG:
+        oc_bn = KPG
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
         if CPG % bn == 0:
             ic_bn = bn
             break
+    if ic_bn > CPG:
+        ic_bn = CPG
 
     reg_n = 1
     for n in range(31, 0, -1):

From 12de6b8ba9a1b8594c6eb7b86de952f0ea8624d8 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Fri, 28 Aug 2020 13:19:23 +0100
Subject: [PATCH 19/28] Added asymmetric padding support Fixed linting

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 49 +++++++++++++---------
 python/tvm/topi/x86/group_conv2d.py     | 56 +++++++++++++------------
 2 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index de7e16f5cd16..5bdc47cad01c 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
 
 import tvm
 from tvm import autotvm
@@ -64,16 +65,21 @@ def _fallback_schedule(cfg, wkl):
 
     oc_bn = 1
 
+    oc_bn = 1
     for bn in range(simd_width, 0, -1):
-        if KPG % bn == 0:
+        if kernels_per_group % bn == 0:
             oc_bn = bn
             break
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if CPG % bn == 0:
+        if kernel_depth % bn == 0:
             ic_bn = bn
             break
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -103,36 +109,39 @@ def group_conv2d_nchw_spatial_pack(
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        HPAD, WPAD = padding, padding
+        pt, pl, pb, pr = padding, padding, padding, padding
     elif len(padding) == 2:
         HPAD, WPAD = padding
+        pt, pb = HPAD, HPAD
+        pl, pr = WPAD, WPAD
     else:
-        HPAD, _, WPAD, _ = padding
+        pt, pl, pb, pr = padding
+
+    HPAD = pt + pb
+    WPAD = pl + pr
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        HSTR, WSTR = strides, strides
+        stride_h, stride_w = strides, strides
     else:
-        HSTR, WSTR = strides
+        stride_h, stride_w = strides
 
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = IH + 2 * HPAD
-    pad_width = IW + 2 * WPAD
+    pad_height = in_height + pt + pb
+    pad_width = in_width + pl + pr
 
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
-    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
+    dilated_kernel_h = (k_height - 1) * dilation_h + 1
+    dilated_kernel_w = (k_width - 1) * dilation_w + 1
+    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
 
-    G = groups
-    KPG = CO // G
-    CPG = CI // G
+    kernels_per_group = out_channel // groups
 
-    cfg.define_split("tile_ic", CI, num_outputs=2)
-    cfg.define_split("tile_oc", CO, num_outputs=2)
-    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index d0b25cc83e74..9eeba9f09fe3 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
 
 import tvm
 from tvm import autotvm
@@ -69,19 +70,19 @@ def _fallback_schedule(cfg, wkl):
 
     oc_bn = 1
     for bn in range(simd_width, 0, -1):
-        if KPG % bn == 0:
+        if kernels_per_group % bn == 0:
             oc_bn = bn
             break
-    if oc_bn > KPG:
-        oc_bn = KPG
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
 
     ic_bn = 1
     for bn in range(oc_bn, 0, -1):
-        if CPG % bn == 0:
+        if kernel_depth % bn == 0:
             ic_bn = bn
             break
-    if ic_bn > CPG:
-        ic_bn = CPG
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
 
     reg_n = 1
     for n in range(31, 0, -1):
@@ -111,36 +112,39 @@ def group_conv2d_nchw_spatial_pack(
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        HPAD, WPAD = padding, padding
+        pt, pl, pb, pr = padding, padding, padding, padding
     elif len(padding) == 2:
         HPAD, WPAD = padding
+        pt, pb = HPAD, HPAD
+        pl, pr = WPAD, WPAD
     else:
-        HPAD, _, WPAD, _ = padding
+        pt, pl, pb, pr = padding
+
+    HPAD = pt + pb
+    WPAD = pl + pr
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
-        HSTR, WSTR = strides, strides
+        stride_h, stride_w = strides, strides
     else:
-        HSTR, WSTR = strides
+        stride_h, stride_w = strides
 
-    N, CI, IH, IW = get_const_tuple(data.shape)
-    CO, CIG, KH, KW = get_const_tuple(kernel.shape)
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = IH + 2 * HPAD
-    pad_width = IW + 2 * WPAD
+    pad_height = in_height + pt + pb
+    pad_width = in_width + pl + pr
 
-    dilated_kernel_h = (KH - 1) * dilation_h + 1
-    dilated_kernel_w = (KW - 1) * dilation_w + 1
-    OH = (IH + 2 * HPAD - dilated_kernel_h) // HSTR + 1
-    OW = (IW + 2 * WPAD - dilated_kernel_w) // WSTR + 1
+    dilated_kernel_h = (k_height - 1) * dilation_h + 1
+    dilated_kernel_w = (k_width - 1) * dilation_w + 1
+    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
 
-    G = groups
-    KPG = CO // G
-    CPG = CI // G
+    kernels_per_group = out_channel // groups
 
-    cfg.define_split("tile_ic", CI, num_outputs=2)
-    cfg.define_split("tile_oc", CO, num_outputs=2)
-    cfg.define_split("tile_ow", OW, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
     cfg.define_knob("unroll_kw", [True, False])
 
     # If no config was set, we can fallback to default config.
@@ -292,13 +296,13 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-    A, W = data, kernel_vec
+    _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
     if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
-    groups, batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis
+    groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
     s[A1].parallel(parallel_axis)

From ef858b4494009441e1c2623d4cb15aba2b2b97e4 Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Fri, 28 Aug 2020 13:44:47 +0100
Subject: [PATCH 20/28] Improve linting

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 26 +++++++++++++------------
 python/tvm/topi/x86/group_conv2d.py     | 22 ++++++++++-----------
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 5bdc47cad01c..bdcba2c35ecb 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -19,6 +19,8 @@
 import tvm
 from tvm import autotvm
 from tvm import te
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
 from ..util import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
@@ -109,16 +111,16 @@ def group_conv2d_nchw_spatial_pack(
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pt, pl, pb, pr = padding, padding, padding, padding
+        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
     elif len(padding) == 2:
-        HPAD, WPAD = padding
-        pt, pb = HPAD, HPAD
-        pl, pr = WPAD, WPAD
+        hpad, wpad = padding
+        pad_top, pad_bottom = hpad, hpad
+        pad_left, pad_right = wpad, wpad
     else:
-        pt, pl, pb, pr = padding
+        pad_top, pad_left, pad_bottom, pad_right = padding
 
-    HPAD = pt + pb
-    WPAD = pl + pr
+    hpad = pad_top + pad_bottom
+    wpad = pad_left + pad_right
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
@@ -129,13 +131,13 @@ def group_conv2d_nchw_spatial_pack(
     batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
     out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pt + pb
-    pad_width = in_width + pl + pr
+    pad_height = in_height + pad_top + pad_bottom
+    pad_width = in_width + pad_left + pad_right
 
     dilated_kernel_h = (k_height - 1) * dilation_h + 1
     dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
+    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
 
     kernels_per_group = out_channel // groups
 
@@ -295,7 +297,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         cfg["unroll_kw"].val,
     )
 
-    A, W = data, kernel_vec
+    _, W = data, kernel_vec
     A0, A1 = data_pad, data_vec
 
     # schedule data
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 9eeba9f09fe3..4019068d4d8a 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -112,16 +112,16 @@ def group_conv2d_nchw_spatial_pack(
 
     assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
     if isinstance(padding, int):
-        pt, pl, pb, pr = padding, padding, padding, padding
+        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
     elif len(padding) == 2:
-        HPAD, WPAD = padding
-        pt, pb = HPAD, HPAD
-        pl, pr = WPAD, WPAD
+        hpad, wpad = padding
+        pad_top, pad_bottom = hpad, hpad
+        pad_left, pad_right = wpad, wpad
     else:
-        pt, pl, pb, pr = padding
+        pad_top, pad_left, pad_bottom, pad_right = padding
 
-    HPAD = pt + pb
-    WPAD = pl + pr
+    hpad = pad_top + pad_bottom
+    wpad = pad_left + pad_right
 
     assert isinstance(strides, int) or len(strides) == 2
     if isinstance(strides, int):
@@ -132,13 +132,13 @@ def group_conv2d_nchw_spatial_pack(
     batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
     out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape)
 
-    pad_height = in_height + pt + pb
-    pad_width = in_width + pl + pr
+    pad_height = in_height + pad_top + pad_bottom
+    pad_width = in_width + pad_left + pad_right
 
     dilated_kernel_h = (k_height - 1) * dilation_h + 1
     dilated_kernel_w = (k_width - 1) * dilation_w + 1
-    out_height = (in_height + pt + pb - dilated_kernel_h) // stride_h + 1
-    out_width = (in_width + pl + pr - dilated_kernel_w) // stride_w + 1
+    out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1
+    out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1
 
     kernels_per_group = out_channel // groups
 

From 10d6e4018ecedc789989d712a6892103fb55a58c Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Mon, 31 Aug 2020 09:51:13 +0100
Subject: [PATCH 21/28] Better linting, disable final linting checks

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 2 ++
 python/tvm/topi/x86/group_conv2d.py     | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index bdcba2c35ecb..64f179ba5bbb 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+# pylint: disable=no-value-for-parameter,import-outside-toplevel
 """Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM"""
 
 import tvm
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 4019068d4d8a..0f1b1a87ac94 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+# pylint: disable=no-value-for-parameter,import-outside-toplevel
 """Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86"""
 
 import tvm
@@ -302,6 +304,8 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
     # schedule data
     if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
+        # s[A0].compute_at(s[A1])
+
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
     parallel_axis = s[A1].fuse(batch, ic_chunk, ih)
@@ -342,8 +346,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
+    # s[A1].compute_at(CC, parallel_axis)
     s[CC].parallel(parallel_axis)
 
+
+
     s[CC].vectorize(oc_block)
 
     s[CC].unroll(ow_block)

From f468988cc5458db5e43de9fe76bcce856310b52d Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Thu, 3 Sep 2020 19:56:36 +0100
Subject: [PATCH 22/28] Fixed final linting errors (figured out how to run lint
 tests locally)

---
 python/tvm/topi/x86/group_conv2d.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 0f1b1a87ac94..c1f3ece0f323 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -304,7 +304,6 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
     # schedule data
     if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
-        # s[A0].compute_at(s[A1])
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
 
@@ -346,10 +345,8 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
 
     parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh)
-    # s[A1].compute_at(CC, parallel_axis)
-    s[CC].parallel(parallel_axis)
-
 
+    s[CC].parallel(parallel_axis)
 
     s[CC].vectorize(oc_block)
 

From 6fef83a0b64d21523c10e094506ad1c5ec209af1 Mon Sep 17 00:00:00 2001
From: Wheest <Wheest@users.noreply.github.com>
Date: Mon, 21 Dec 2020 20:06:54 +0000
Subject: [PATCH 23/28] Update conv2d.py

Fixed merge issue
---
 python/tvm/topi/nn/conv2d.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 80f87f86736c..a00df1fdbf0f 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -174,10 +174,14 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou
     else:
         KH, KW, CIG, CO = get_const_tuple(kernel.shape)
 
+<<<<<<< HEAD
     pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
     dilation_h, dilation_w = (
         dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
     )
+=======
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
+>>>>>>> c57bd780a (Update conv2d.py)
     GRPS = CI // CIG
     if isinstance(stride, (tuple, list)):
         HSTR, WSTR = stride

From cf87146ce075ed89a3763a4cff41d6834f514d5a Mon Sep 17 00:00:00 2001
From: "Perry Gibson (gabriel)" <p.gibson.2@research.gla.ac.uk>
Date: Wed, 6 Jan 2021 18:25:53 +0000
Subject: [PATCH 24/28] Rebase, and update responding to some comments

---
 python/tvm/relay/op/strategy/x86.py     | 10 +++++-----
 python/tvm/topi/arm_cpu/group_conv2d.py |  2 +-
 python/tvm/topi/nn/conv2d.py            |  4 ----
 python/tvm/topi/x86/group_conv2d.py     |  4 ++--
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 60bd92ef63d1..98eb28ee0ef3 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             assert kernel_layout == "HWIO"
             if not is_auto_scheduler_enabled():
                 logger.warning("group_conv2d is not optimized for x86 with autotvm.")
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                name="group_conv2d_nhwc.generic",
-            )
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
+                    wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
+                    name="group_conv2d_nhwc.generic",
+                )
         else:
             raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
     return strategy
diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 64f179ba5bbb..6a6e578803db 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -23,7 +23,7 @@
 from tvm import te
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
-from ..util import get_const_tuple
+from ..utils import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index a00df1fdbf0f..80f87f86736c 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -174,14 +174,10 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou
     else:
         KH, KW, CIG, CO = get_const_tuple(kernel.shape)
 
-<<<<<<< HEAD
     pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
     dilation_h, dilation_w = (
         dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
     )
-=======
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
->>>>>>> c57bd780a (Update conv2d.py)
     GRPS = CI // CIG
     if isinstance(stride, (tuple, list)):
         HSTR, WSTR = stride
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index c1f3ece0f323..7ac3a8928650 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -23,8 +23,8 @@
 from tvm import te
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
-from .util import get_fp32_len
-from ..util import get_const_tuple
+from .utils import get_fp32_len
+from ..utils import get_const_tuple
 from ..nn.pad import pad
 from .. import tag
 

From 8f74750853f1e6abe489c9f6c52fd4c126fe3b3d Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Thu, 18 Mar 2021 15:37:15 +0000
Subject: [PATCH 25/28] Fixed AutoScheduler bug for NHWC case

---
 python/tvm/relay/op/strategy/x86.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 98eb28ee0ef3..60bd92ef63d1 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -214,11 +214,11 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             assert kernel_layout == "HWIO"
             if not is_auto_scheduler_enabled():
                 logger.warning("group_conv2d is not optimized for x86 with autotvm.")
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
-                    wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
-                    name="group_conv2d_nhwc.generic",
-                )
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
+                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
+                name="group_conv2d_nhwc.generic",
+            )
         else:
             raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
     return strategy

From 53b4d5bd74a64b268d06026b3e91f5abd7254f4e Mon Sep 17 00:00:00 2001
From: Perry Gibson <perry@gibsonic.org>
Date: Tue, 23 Mar 2021 14:19:33 +0000
Subject: [PATCH 26/28] Minor fix

---
 python/tvm/topi/x86/group_conv2d.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 7ac3a8928650..7e9c532fb15b 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -276,6 +276,7 @@ def traverse(op):
                 s[kernel].compute_inline()
             data_vec = conv_out.op.input_tensors[0]
             data = data_vec.op.input_tensors[0]
+            data_pad = None
             if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
                 data_pad = data
                 data = data_pad.op.input_tensors[0]

From fd23e11c2394f8abcf77e58f05985c3ab0149482 Mon Sep 17 00:00:00 2001
From: "Perry Gibson (gabriel)" <p.gibson.2@research.gla.ac.uk>
Date: Tue, 23 Mar 2021 16:54:05 +0000
Subject: [PATCH 27/28] Fixed removal of infer_pad to no padding

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 2 +-
 python/tvm/topi/x86/group_conv2d.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index 6a6e578803db..c7fad430836c 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -303,7 +303,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
+    if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index 7e9c532fb15b..a6ac7e5ac5a3 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -303,7 +303,7 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
+    if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
         s[A0].compute_inline()
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis

From 7a2c3caa472e0fceaa8bb10709883f30d23de328 Mon Sep 17 00:00:00 2001
From: "Perry Gibson (gabriel)" <p.gibson.2@research.gla.ac.uk>
Date: Tue, 23 Mar 2021 20:01:31 +0000
Subject: [PATCH 28/28] Fixed unexpected linting error

---
 python/tvm/topi/arm_cpu/group_conv2d.py | 6 +++++-
 python/tvm/topi/x86/group_conv2d.py     | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py
index c7fad430836c..d852b9acef66 100644
--- a/python/tvm/topi/arm_cpu/group_conv2d.py
+++ b/python/tvm/topi/arm_cpu/group_conv2d.py
@@ -303,7 +303,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
+    if (
+        data_pad is not None
+        and isinstance(data_pad.op, tvm.te.ComputeOp)
+        and "pad" in data_pad.op.tag
+    ):
         s[A0].compute_inline()
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis
diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py
index a6ac7e5ac5a3..0501c5534cf2 100644
--- a/python/tvm/topi/x86/group_conv2d.py
+++ b/python/tvm/topi/x86/group_conv2d.py
@@ -303,7 +303,11 @@ def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out,
     A0, A1 = data_pad, data_vec
 
     # schedule data
-    if data_pad is not None and isinstance(data_pad.op, tvm.te.ComputeOp) and "pad" in data_pad.op.tag:
+    if (
+        data_pad is not None
+        and isinstance(data_pad.op, tvm.te.ComputeOp)
+        and "pad" in data_pad.op.tag
+    ):
         s[A0].compute_inline()
 
     groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis