From 2408739dcff1f1fc41805a242e8ff767517047b2 Mon Sep 17 00:00:00 2001 From: ligeng Date: Mon, 8 Nov 2021 12:57:16 +0000 Subject: [PATCH 01/27] f wrong type check in conv2d_transpose --- src/relay/op/nn/convolution.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index c27227b2eb73..a46a6ddec76d 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1053,18 +1053,18 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); ICHECK(trans_in_layout.defined()) - << "Conv only support input layouts that are convertible from NCHW." + << "Conv2DTransposed only support input layouts that are convertible from NCHW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); ICHECK(trans_kernel_layout.defined()) - << "Conv only support kernel layouts that are convertible from OIHW." + << "Conv2DTransposed only support kernel layouts that are convertible from OIHW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); ICHECK(trans_out_layout.defined()) - << "Conv only support output layouts that are convertible from NCHW." + << "Conv2DTransposed only support output layouts that are convertible from NCHW." << " But got " << out_layout; IndexExpr channels, dilated_ksize_y, dilated_ksize_x; @@ -1099,16 +1099,20 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a // check the size ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && reporter->AssertEQ(param->kernel_size[1], wshape[3])) - << "Conv2D: shape of weight is inconsistent with kernel_size, " + << "Conv2DTransposed: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << Array(wshape); } if (param->channels.defined()) { - ICHECK(reporter->AssertEQ(param->channels, wshape[1])) - << "Conv2D: shape of weight is inconsistent with channels, " + ICHECK(reporter->AssertEQ(param->channels, wshape[0])) + << "Conv2DTransposed: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << Array(wshape); } if (!dshape_nchw[1].as() && !wshape[0].as()) { - ICHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[0])); + ICHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[1])) + << "Conv2DTransposed: data.shape[1] // groups != weight.shape[1], " + << " data.shape= " << Array(dshape_nchw) + << " groups= " << param->groups + << " weight.shape= " << Array(wshape); } channels = wshape[1]; dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; From 5dc34a92c06dc7c4d7ec00dbf2845130dc5ece7f Mon Sep 17 00:00:00 2001 From: ligeng Date: Mon, 8 Nov 2021 13:22:25 +0000 Subject: [PATCH 02/27] add test case for conv2d transpose --- tests/python/relay/test_op_level2.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index da2877063c45..f9c912809208 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -858,6 +858,30 @@ def test_conv2d_transpose_nchw_run(): ) tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) +# @tvm.testing.uses_gpu 
+@pytest.mark.skip(reason="Currently Conv2dTranpose only supports groups == 1 as in L284 in python/tvm/relay/op/strategy/x86.py") +def test_conv2d_transpose_nchw_groups_run(): + # Add groups testing + dshape = (1, 16, 18, 18) + kshape = (16, 1, 3, 3) + oshape = (1, 16, 36, 36) + x = relay.var("x", shape=dshape) + w = relay.var("w") + y = relay.nn.conv2d_transpose( + x, w, channels=10, kernel_size=(3, 3), strides=(2, 2), padding=(1, 1), output_padding=(1, 1), groups=16 + ) + func = relay.Function([x, w], y) + dtype = "float32" + data = np.random.uniform(size=dshape).astype(dtype) + kernel = np.random.uniform(size=kshape).astype(dtype) + ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(data, kernel, 2, 1, (1, 1)) + + for target, dev in tvm.testing.enabled_targets(): + op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + data, kernel + ) + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) + @tvm.testing.uses_gpu def test_conv2d_transpose_nhwc_run(): From 2885a5760c98e0745e2b8f585d2f89acd7233481 Mon Sep 17 00:00:00 2001 From: ligeng Date: Mon, 8 Nov 2021 14:59:32 +0000 Subject: [PATCH 03/27] add groups support for conv2d_transpose --- python/tvm/topi/nn/conv2d_transpose.py | 88 ++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index 22188bcd45a4..699ca7e72187 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -16,6 +16,8 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument """Transposed 2D convolution operators (sometimes called Deconvolution).""" +import collections + import tvm from tvm import te from tvm import relay @@ -24,6 +26,18 @@ from .utils import get_pad_tuple from ..utils import simplify +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + assert len(x) == n, f"Input can only have {n} elements, but got {len(x)} instead: {x}." + return x + return tuple(repeat(x, n)) + return parse + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype, output_padding): """Transposed 2D convolution nchw forward operator. @@ -111,11 +125,85 @@ def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype, axis=[dc, dh, dw], ), tag="conv2d_transpose_nchw", + ) return Output +def group_conv2d_transpose_nchw(data, kernel, strides=1, padding=0, output_padding=0, groups=1, dilation=1, out_dtype=None): + # some pre-processing and prelimnary checks + if out_dtype is None: + out_dtype = data.dtype + + # strides = _pair(strides) + # padding = _pair(padding) + # output_padding = _pair(output_padding) + # dilation = _pair(dilation) + batch, in_channels, in_height, in_width = data.shape + _, out_c, filter_h, filter_w = kernel.shape + assert in_channels % groups == 0, "input channels must divide group size" + # assert out_c % groups == 0, "output channels must divide group size" + + batch, in_c, in_h, in_w = data.shape + _, out_c, filter_h, filter_w = kernel.shape + stride_h, stride_w = strides + opad_h, opad_w = output_padding + assert opad_h < stride_h and opad_w < stride_w, f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." 
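Note: the _ntuple helper introduced earlier in this patch calls repeat(), which is not imported anywhere in the hunks shown, so the scalar branch would likely fail with a NameError at runtime; itertools.repeat is presumably what was intended. A self-contained sketch of the helper under that assumption (illustrative only, not part of the patch):

    import collections.abc
    from itertools import repeat  # assumed import; the patch as posted omits it

    def _ntuple(n):
        # Expand a scalar into an n-tuple, or pass through an iterable of length n.
        def parse(x):
            if isinstance(x, collections.abc.Iterable):
                assert len(x) == n, f"Input can only have {n} elements, but got {len(x)} instead: {x}."
                return x
            return tuple(repeat(x, n))
        return parse

    _pair = _ntuple(2)  # _pair(3) -> (3, 3); _pair((2, 1)) -> (2, 1)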
+ # dilate data + data_dilate = dilate(data, [1, 1, stride_h, stride_w], name="data_dilate") + # pad data + fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w)) + bpad_top = filter_h - 1 - fpad_top + bpad_bottom = filter_h - 1 - fpad_bottom + opad_h + bpad_left = filter_w - 1 - fpad_left + bpad_right = filter_w - 1 - fpad_right + opad_w + data_pad = pad( + data_dilate, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right], name="data_pad" + ) + # transform kernel layout from IOHW to OIHW, and rotate kernel by 180 degrees + kernel_transform = te.compute( + (out_c, in_c, filter_h, filter_w), + lambda i, o, h, w: kernel[o][i][filter_h - 1 - h][filter_w - 1 - w], + name="kernel_transform", + ) + + batch, in_c, in_h, in_w = data_pad.shape + out_c, _, filter_h, filter_w = kernel_transform.shape + + # convolution stage + out_c = simplify(out_c) + out_channels = simplify(out_c * groups) + + out_h = simplify(in_h - filter_h + 1) + out_w = simplify(in_w - filter_w + 1) + dc = te.reduce_axis((0, in_c // groups), name="dc") + dh = te.reduce_axis((0, filter_h), name="dh") + dw = te.reduce_axis((0, filter_w), name="dw") + + # data: batch, in_channels, out_h, out_w + # weight: out_channels // G, in_channels, out_h, out_w + return te.compute( + (batch, out_channels, out_h, out_w), + lambda b, c, h, w: te.sum( + data_pad[ + b, + c // (out_channels // groups) * (in_channels // groups) + dc, + h + dh, + w + dw + ].astype(out_dtype) + * kernel_transform[ + c % (out_channels // groups), + c // (out_channels // groups) * (in_channels // groups) + dc, + dh, + dw + ].astype(out_dtype), + axis=[dc, dh, dw], + ), + tag="conv2d_transpose_nchw", + ) + + @tvm.target.generic_func def conv2d_transpose_legalize(attrs, inputs, types): """Legalizes Transposed 2D convolution op. From 8f5a97908d321103c2b4aa97bbbec6fb934a7b24 Mon Sep 17 00:00:00 2001 From: ligeng Date: Mon, 8 Nov 2021 15:00:40 +0000 Subject: [PATCH 04/27] add naive implementation and schedule for conv2d with groups --- python/tvm/relay/op/strategy/arm_cpu.py | 20 +++++++++---- python/tvm/relay/op/strategy/cuda.py | 20 +++++++++---- python/tvm/relay/op/strategy/generic.py | 37 +++++++++++++++++++++---- python/tvm/relay/op/strategy/x86.py | 21 ++++++++++---- python/tvm/topi/generic/nn.py | 17 ++++++++++++ 5 files changed, 92 insertions(+), 23 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 06dfc87038fe..7dc7806cc30e 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -373,13 +373,21 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" + # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw), - name="conv2d_tranpose_nchw.arm_cpu", - ) + if groups == 1: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw), + name="conv2d_tranpose_nchw.arm_cpu", + ) + else: + # FIXME: Here should be a specialized implementation and schedule instead general one. 
+ strategy.add_implementation( + wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + name="group_conv2d_transpose_nchw.arm_cpu", + ) return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 5f24dbda9d35..e139bb4d6c5e 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -557,13 +557,21 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" + # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.cuda", - ) + if groups == 1: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), + wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.cuda", + ) + else: + # FIXME: Here should be a specialized implementation and schedule instead general one. + strategy.add_implementation( + wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + name="group_conv2d_transpose_nchw.cuda", + ) return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 777f17ba6084..6b35a87f1ad0 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -461,6 +461,26 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype): return compute_conv2d_transpose +# FIXME: This is a temporal fix for groups > 1 in Conv2dTranspose. This should be merged with above functions in the future. 
+def wrap_compute_group_conv2d_transpose(topi_compute): + """wrap conv2d_transpose topi compute""" + def compute_conv2d_transpose(attrs, inputs, out_dtype): + """Compute definition of conv2d_transpose""" + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype + groups = attrs.groups + dilation = attrs.dilation + output_padding = get_const_tuple(attrs.output_padding) + out = topi_compute(inputs[0], inputs[1], strides, padding, + output_padding, groups, dilation, out_dtype) + return [out] + + return compute_conv2d_transpose + + + @override_native_generic_func("conv2d_transpose_strategy") def conv2d_transpose_strategy(attrs, inputs, out_type, target): @@ -473,11 +493,18 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.generic", - ) + if groups == 1: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.generic", + ) + else: + strategy.add_implementation( + wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + name="group_conv2d_transpose_nchw.generic", + ) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1c8d1b478cb1..99a80942ceb7 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -281,13 +281,22 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" + # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), - wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.x86", - ) + + if groups == 1: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.x86", + ) + else: + strategy.add_implementation( + wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + # FIXME: Here should be a manually written schedule instead default one. + name="group_conv2d_transpose_nchw.x86", + ) return strategy diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 1b3214154687..22a90aa2cd07 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -428,6 +428,23 @@ def schedule_group_conv2d_nchw(outs): return _default_schedule(outs, False) +def schedule_group_conv2d_transpose_nchw(outs): + """Schedule for group_conv2d_transpose_nchw + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of group_conv2d_nhwc + in the format of an array of tensors. 
+ + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + def schedule_group_conv2d_nhwc(outs): """Schedule for group_conv2d_nhwc From 9bea044ba5b54eaa8c283d004d24d8e9b76f10f6 Mon Sep 17 00:00:00 2001 From: ligeng Date: Mon, 8 Nov 2021 16:02:38 +0000 Subject: [PATCH 05/27] enable tests for cpu and arm_cpu, raise error for cuda platform --- python/tvm/relay/op/strategy/cuda.py | 3 ++- tests/python/relay/test_op_level2.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index e139bb4d6c5e..05bb26ecc399 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -566,10 +566,11 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): name="conv2d_transpose_nchw.cuda", ) else: + raise NotImplementedError("CUDA schedule is not enable for conv2d transpose when groups > 1. See https://github.com/apache/tvm/pull/9465 for details.") # FIXME: Here should be a specialized implementation and schedule instead general one. strategy.add_implementation( wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_transpose_nchw), name="group_conv2d_transpose_nchw.cuda", ) return strategy diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index f9c912809208..ecd479d53e42 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -858,8 +858,8 @@ def test_conv2d_transpose_nchw_run(): ) tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) -# @tvm.testing.uses_gpu -@pytest.mark.skip(reason="Currently Conv2dTranpose only supports groups == 1 as in L284 in python/tvm/relay/op/strategy/x86.py") + +@tvm.testing.uses_gpu def test_conv2d_transpose_nchw_groups_run(): # Add groups testing dshape = (1, 16, 18, 18) @@ -868,19 +868,21 @@ def test_conv2d_transpose_nchw_groups_run(): x = relay.var("x", shape=dshape) w = relay.var("w") y = relay.nn.conv2d_transpose( - x, w, channels=10, kernel_size=(3, 3), strides=(2, 2), padding=(1, 1), output_padding=(1, 1), groups=16 + x, w, channels=16, kernel_size=(3, 3), strides=(2, 2), padding=(1, 1), output_padding=(1, 1), groups=16 ) func = relay.Function([x, w], y) dtype = "float32" data = np.random.uniform(size=dshape).astype(dtype) kernel = np.random.uniform(size=kshape).astype(dtype) - ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(data, kernel, 2, 1, (1, 1)) - + # ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(data, kernel, 2, 1, (1, 1)) + # FIXME: the testing.conv2d_transpose does not support groups yet. 
Currently only test whether it is exectuable for target, dev in tvm.testing.enabled_targets(): + if "cuda" in target: + continue op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( data, kernel ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) + # tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-7, atol=1e-7) @tvm.testing.uses_gpu From a1c7308668ca66931681b05e533ae2b46e4b8c2e Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:17:16 +0000 Subject: [PATCH 06/27] revert the cuda and generic strategy --- python/tvm/relay/op/strategy/cuda.py | 23 +++++---------- python/tvm/relay/op/strategy/generic.py | 39 ++++--------------------- 2 files changed, 13 insertions(+), 49 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 05bb26ecc399..2c33837de89e 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -557,22 +557,13 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - # assert groups == 1, "only support groups == 1 for now" + assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - if groups == 1: - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.cuda", - ) - else: - raise NotImplementedError("CUDA schedule is not enable for conv2d transpose when groups > 1. See https://github.com/apache/tvm/pull/9465 for details.") - # FIXME: Here should be a specialized implementation and schedule instead general one. - strategy.add_implementation( - wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_transpose_nchw), - name="group_conv2d_transpose_nchw.cuda", - ) + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), + wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.cuda", + ) return strategy @@ -1248,4 +1239,4 @@ def einsum_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_extern), name="einsum.cuda", ) - return strategy + return strategy \ No newline at end of file diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 6b35a87f1ad0..8ef54ceaca2e 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -461,26 +461,6 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype): return compute_conv2d_transpose -# FIXME: This is a temporal fix for groups > 1 in Conv2dTranspose. This should be merged with above functions in the future. 
-def wrap_compute_group_conv2d_transpose(topi_compute): - """wrap conv2d_transpose topi compute""" - def compute_conv2d_transpose(attrs, inputs, out_dtype): - """Compute definition of conv2d_transpose""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype - groups = attrs.groups - dilation = attrs.dilation - output_padding = get_const_tuple(attrs.output_padding) - out = topi_compute(inputs[0], inputs[1], strides, padding, - output_padding, groups, dilation, out_dtype) - return [out] - - return compute_conv2d_transpose - - - @override_native_generic_func("conv2d_transpose_strategy") def conv2d_transpose_strategy(attrs, inputs, out_type, target): @@ -493,18 +473,11 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - if groups == 1: - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.generic", - ) - else: - strategy.add_implementation( - wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), - name="group_conv2d_transpose_nchw.generic", - ) + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.generic", + ) return strategy @@ -1757,4 +1730,4 @@ def einsum_strategy(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_einsum), name="einsum.generic", ) - return strategy + return strategy \ No newline at end of file From fcc2f00d54a97bad166a8a04c5edd49974acd517 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:42:36 +0000 Subject: [PATCH 07/27] revert back the x86 strategy --- python/tvm/relay/op/strategy/x86.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 99a80942ceb7..8e032d6e8115 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -281,22 +281,13 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - # assert groups == 1, "only support groups == 1 for now" + assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - - if groups == 1: - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), - wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.x86", - ) - else: - strategy.add_implementation( - wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), - # FIXME: Here should be a manually written schedule instead default one. 
- name="group_conv2d_transpose_nchw.x86", - ) + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.x86", + ) return strategy @@ -691,4 +682,4 @@ def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_typ raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) ) - return strategy + return strategy \ No newline at end of file From 59e807baf7fa9fdacbfc3510115c9f12120081aa Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:44:33 +0000 Subject: [PATCH 08/27] revert back the arm_cpu strategy --- python/tvm/relay/op/strategy/arm_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 7dc7806cc30e..10db6cc86324 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -422,4 +422,4 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense), name="bitserial_dense.arm_cpu", ) - return strategy + return strategy \ No newline at end of file From 3eacec4070f55bd456ef2e86c7daeaf0005c6012 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:46:09 +0000 Subject: [PATCH 09/27] revert back the arm_cpu strategy --- python/tvm/relay/op/strategy/arm_cpu.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/relay/op/strategy/generic.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 10db6cc86324..7dc7806cc30e 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -422,4 +422,4 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense), name="bitserial_dense.arm_cpu", ) - return strategy \ No newline at end of file + return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 2c33837de89e..5f24dbda9d35 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1239,4 +1239,4 @@ def einsum_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_extern), name="einsum.cuda", ) - return strategy \ No newline at end of file + return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 8ef54ceaca2e..777f17ba6084 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1730,4 +1730,4 @@ def einsum_strategy(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_einsum), name="einsum.generic", ) - return strategy \ No newline at end of file + return strategy From 478ff9aad244d20cb241a9c902ebb4b99df145e7 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:47:43 +0000 Subject: [PATCH 10/27] revert back the arm_cpu strategy --- python/tvm/relay/op/strategy/arm_cpu.py | 20 ++++++-------------- src/relay/op/nn/convolution.h | 13 +++++++------ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 7dc7806cc30e..06dfc87038fe 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ 
b/python/tvm/relay/op/strategy/arm_cpu.py @@ -373,21 +373,13 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - # assert groups == 1, "only support groups == 1 for now" + assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - if groups == 1: - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw), - name="conv2d_tranpose_nchw.arm_cpu", - ) - else: - # FIXME: Here should be a specialized implementation and schedule instead general one. - strategy.add_implementation( - wrap_compute_group_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), - name="group_conv2d_transpose_nchw.arm_cpu", - ) + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw), + name="conv2d_tranpose_nchw.arm_cpu", + ) return strategy diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index a46a6ddec76d..717cfa426a61 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1044,7 +1044,7 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (data == nullptr) return false; static const Layout kNCHW("NCHW"); - static const Layout kOIHW("OIHW"); + static const Layout kOIHW("OIHW"); // FIXME: weight layout should be IOHW as discussed in https://github.com/apache/tvm/pull/9336 const Conv2DTransposeAttrs* param = attrs.as(); ICHECK(param != nullptr); @@ -1103,13 +1103,14 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a << " kernel_size=" << param->kernel_size << " wshape=" << Array(wshape); } if (param->channels.defined()) { - ICHECK(reporter->AssertEQ(param->channels, wshape[0])) - << "Conv2DTransposed: shape of weight is inconsistent with channels, " - << " channels=" << param->channels << " wshape=" << Array(wshape); + ICHECK(reporter->AssertEQ(indexdiv(param->channels, param->groups), wshape[1])) + << "Conv2DTransposed: shape of weight is inconsistent with out_channels, " + << " out_channels // groups != weigt.shape[1] " + << " out_channels=" << param->channels << " weigt.shape=" << Array(wshape); } if (!dshape_nchw[1].as() && !wshape[0].as()) { - ICHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[1])) - << "Conv2DTransposed: data.shape[1] // groups != weight.shape[1], " + ICHECK(reporter->AssertEQ(dshape_nchw[1], wshape[0])) + << "Conv2DTransposed: shape of weight is inconsistent with in_channels." 
<< " data.shape= " << Array(dshape_nchw) << " groups= " << param->groups << " weight.shape= " << Array(wshape); From 11c2c7564ff2d49625845a988cba506a1c034fcf Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:49:11 +0000 Subject: [PATCH 11/27] fix EOF of x86 --- python/tvm/relay/op/strategy/x86.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 8e032d6e8115..1c8d1b478cb1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -682,4 +682,4 @@ def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_typ raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) ) - return strategy \ No newline at end of file + return strategy From 064ee1a4ae97fc4640f39a6066c19d09f1883bc2 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 17:52:42 +0000 Subject: [PATCH 12/27] clang lint updated c++ code --- src/relay/op/nn/convolution.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 717cfa426a61..70f480e8d8cf 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1105,8 +1105,8 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (param->channels.defined()) { ICHECK(reporter->AssertEQ(indexdiv(param->channels, param->groups), wshape[1])) << "Conv2DTransposed: shape of weight is inconsistent with out_channels, " - << " out_channels // groups != weigt.shape[1] " - << " out_channels=" << param->channels << " weigt.shape=" << Array(wshape); + << " out_channels // groups != weight.shape[1] " + << " out_channels=" << param->channels << " weight.shape=" << Array(wshape); } if (!dshape_nchw[1].as() && !wshape[0].as()) { ICHECK(reporter->AssertEQ(dshape_nchw[1], wshape[0])) From efea36a171c8bf6fbe3873e369d11d00af3a52f9 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 18:07:10 +0000 Subject: [PATCH 13/27] update topi implementation --- python/tvm/topi/nn/conv2d_transpose.py | 61 +++++++++++++++++++++----- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index 699ca7e72187..652a6a8f5c99 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -39,6 +39,7 @@ def parse(x): _triple = _ntuple(3) _quadruple = _ntuple(4) + def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype, output_padding): """Transposed 2D convolution nchw forward operator. @@ -131,25 +132,63 @@ def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype, return Output -def group_conv2d_transpose_nchw(data, kernel, strides=1, padding=0, output_padding=0, groups=1, dilation=1, out_dtype=None): +def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output_padding, groups): + """Group convolution operator in NCHW layout. 
+ + Parameters + ---------- + data : tvm.te.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + kernel : tvm.te.Tensor + 4-D with shape [in_channel, out_channel // groups, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + out_dtype : str + The output data type. This is used for mixed precision. + + output_padding : tuple of ints + Used to get the right output shape for gradients + + groups : int + number of groups + + out_dtype : str + The output type. This is used for mixed precision. + + Returns + ------- + Output : tvm.te.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + if groups == 1: + return conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output_padding) + # some pre-processing and prelimnary checks if out_dtype is None: out_dtype = data.dtype - # strides = _pair(strides) + batch, in_c, in_h, in_w = data.shape + _, out_c, filter_h, filter_w = kernel.shape + # assert in_channels % groups == 0, "input channels must divide group size" + assert out_c % groups == 0, "output channels must divide group size" + + strides = _pair(strides) # padding = _pair(padding) # output_padding = _pair(output_padding) # dilation = _pair(dilation) - batch, in_channels, in_height, in_width = data.shape - _, out_c, filter_h, filter_w = kernel.shape - assert in_channels % groups == 0, "input channels must divide group size" - # assert out_c % groups == 0, "output channels must divide group size" - - batch, in_c, in_h, in_w = data.shape - _, out_c, filter_h, filter_w = kernel.shape + stride_h, stride_w = strides opad_h, opad_w = output_padding - assert opad_h < stride_h and opad_w < stride_w, f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." + assert opad_h < stride_h and opad_w < stride_w, \ + f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." 
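For the shape bookkeeping: with the IOHW kernel layout used here, kernel.shape[0] is in_channels and kernel.shape[1] is out_channels // groups, so the result has kernel.shape[1] * groups output channels. A minimal, illustrative sketch of the output-size arithmetic (the helper below is hypothetical and only mirrors the dilate-pad-convolve construction that follows):

    def _transposed_out_size(in_size, kernel, stride, pad, opad):
        # (in - 1) * stride - 2 * pad + kernel + output_padding, for symmetric padding
        return (in_size - 1) * stride - 2 * pad + kernel + opad

    # Depthwise-style case from the relay test earlier in this series:
    # data (1, 16, 18, 18), IOHW kernel (16, 1, 3, 3), groups=16,
    # strides=(2, 2), padding=(1, 1), output_padding=(1, 1)
    assert _transposed_out_size(18, 3, 2, 1, 1) == 36  # output shape (1, 16, 36, 36)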
# dilate data data_dilate = dilate(data, [1, 1, stride_h, stride_w], name="data_dilate") # pad data @@ -200,7 +239,7 @@ def group_conv2d_transpose_nchw(data, kernel, strides=1, padding=0, output_paddi ].astype(out_dtype), axis=[dc, dh, dw], ), - tag="conv2d_transpose_nchw", + tag="group_conv2d_transpose_nchw", ) From 7e212001c931592fbb57219902523f5eea644df0 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 18:08:13 +0000 Subject: [PATCH 14/27] Revert test --- tests/python/relay/test_op_level2.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index ecd479d53e42..bb74795def7d 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -859,32 +859,6 @@ def test_conv2d_transpose_nchw_run(): tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) -@tvm.testing.uses_gpu -def test_conv2d_transpose_nchw_groups_run(): - # Add groups testing - dshape = (1, 16, 18, 18) - kshape = (16, 1, 3, 3) - oshape = (1, 16, 36, 36) - x = relay.var("x", shape=dshape) - w = relay.var("w") - y = relay.nn.conv2d_transpose( - x, w, channels=16, kernel_size=(3, 3), strides=(2, 2), padding=(1, 1), output_padding=(1, 1), groups=16 - ) - func = relay.Function([x, w], y) - dtype = "float32" - data = np.random.uniform(size=dshape).astype(dtype) - kernel = np.random.uniform(size=kshape).astype(dtype) - # ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(data, kernel, 2, 1, (1, 1)) - # FIXME: the testing.conv2d_transpose does not support groups yet. Currently only test whether it is exectuable - for target, dev in tvm.testing.enabled_targets(): - if "cuda" in target: - continue - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - data, kernel - ) - # tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-7, atol=1e-7) - - @tvm.testing.uses_gpu def test_conv2d_transpose_nhwc_run(): dshape_nhwc = (1, 18, 18, 3) @@ -1913,4 +1887,4 @@ def _test_correlation( if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) + sys.exit(pytest.main(sys.argv)) \ No newline at end of file From 14c771bd9a592481c8561b6e51aaefac1d3a6fbe Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 18:08:36 +0000 Subject: [PATCH 15/27] Revert test --- tests/python/relay/test_op_level2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index bb74795def7d..da2877063c45 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1887,4 +1887,4 @@ def _test_correlation( if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) \ No newline at end of file + sys.exit(pytest.main(sys.argv)) From 8d48f22eca1875ca8098bd5136f9722252e057d9 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 19:28:59 +0000 Subject: [PATCH 16/27] add generic/x86/arm specialization for conv2d_transpose with groups > 1 --- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/relay/op/strategy/generic.py | 32 ++++++++++++++++++------- python/tvm/relay/op/strategy/x86.py | 24 ++++++++++++++----- python/tvm/topi/nn/conv2d_transpose.py | 16 ++++++------- 4 files changed, 50 insertions(+), 24 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 5f24dbda9d35..eee5d9a685b3 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ 
b/python/tvm/relay/op/strategy/cuda.py @@ -557,7 +557,7 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" + assert groups == 1, "only support groups == 1 when targetting cuda/gpu" strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 777f17ba6084..a82b4c8f708a 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -446,7 +446,7 @@ def deformable_conv2d_strategy(attrs, inputs, out_type, target): # conv2d_transpose -def wrap_compute_conv2d_transpose(topi_compute): +def wrap_compute_conv2d_transpose(topi_compute, has_groups=False): """wrap conv2d_transpose topi compute""" def compute_conv2d_transpose(attrs, inputs, out_dtype): @@ -456,7 +456,11 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype): out_dtype = attrs.out_dtype out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype output_padding = get_const_tuple(attrs.output_padding) - out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype, output_padding) + # out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype, output_padding) + args = [inputs[0], inputs[1], strides, padding, out_dtype, output_padding] + if has_groups: + args.append(attrs.groups) + out = topi_compute(*args) return [out] return compute_conv2d_transpose @@ -471,13 +475,25 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" + # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.generic", - ) + # strategy.add_implementation( + # wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + # wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), + # name="conv2d_transpose_nchw.generic", + # ) + if groups == 1: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.generic", + ) + else: # group_transpose_conv2d + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + name="group_conv2d_transpose_nchw.generic", + ) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1c8d1b478cb1..b25cf35b2000 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -281,13 +281,25 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" + # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implementation( 
- wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), - wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), - name="conv2d_transpose_nchw.x86", - ) + # strategy.add_implementation( + # wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + # wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), + # name="conv2d_transpose_nchw.x86", + # ) + if groups == 1: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.x86", + ) + else: + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.group_conv2d_transpose_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_transpose_nchw), + name="group_conv2d_transpose_nchw.x86", + ) return strategy diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index 652a6a8f5c99..8173b6c002ef 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -126,7 +126,6 @@ def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype, axis=[dc, dh, dw], ), tag="conv2d_transpose_nchw", - ) return Output @@ -175,12 +174,12 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output if out_dtype is None: out_dtype = data.dtype - batch, in_c, in_h, in_w = data.shape + batch, in_channels, in_h, in_w = data.shape _, out_c, filter_h, filter_w = kernel.shape - # assert in_channels % groups == 0, "input channels must divide group size" - assert out_c % groups == 0, "output channels must divide group size" + assert in_channels % groups == 0, f"input channels {in_channels} must divide group size {groups}" + # assert out_c % groups == 0, f"output channels {in_c} must divide group size {groups}" - strides = _pair(strides) + strides = _pair(stride) # padding = _pair(padding) # output_padding = _pair(output_padding) # dilation = _pair(dilation) @@ -202,21 +201,20 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output ) # transform kernel layout from IOHW to OIHW, and rotate kernel by 180 degrees kernel_transform = te.compute( - (out_c, in_c, filter_h, filter_w), + (out_c, in_channels, filter_h, filter_w), lambda i, o, h, w: kernel[o][i][filter_h - 1 - h][filter_w - 1 - w], name="kernel_transform", ) - batch, in_c, in_h, in_w = data_pad.shape + batch, in_channels, in_h, in_w = data_pad.shape out_c, _, filter_h, filter_w = kernel_transform.shape # convolution stage - out_c = simplify(out_c) out_channels = simplify(out_c * groups) out_h = simplify(in_h - filter_h + 1) out_w = simplify(in_w - filter_w + 1) - dc = te.reduce_axis((0, in_c // groups), name="dc") + dc = te.reduce_axis((0, in_channels // groups), name="dc") dh = te.reduce_axis((0, filter_h), name="dh") dw = te.reduce_axis((0, filter_w), name="dw") From 2d766f2117cbe68c5e7396c3a2219dc39559ed53 Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 19:32:26 +0000 Subject: [PATCH 17/27] remove commentted codes --- python/tvm/relay/op/strategy/generic.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index a82b4c8f708a..1f7a7554b10e 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -477,11 +477,6 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" # assert groups 
== 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - # strategy.add_implementation( - # wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), - # wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), - # name="conv2d_transpose_nchw.generic", - # ) if groups == 1: strategy.add_implementation( wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), From 795053e6fc7d3f4537c1f942b5d01032c76d17d2 Mon Sep 17 00:00:00 2001 From: Alicja Kwasniewska Date: Wed, 10 Nov 2021 19:49:18 +0000 Subject: [PATCH 18/27] fix lint --- .../topi/testing/conv2d_transpose_python.py | 40 ++++- .../test_topi_group_conv2d_transpose.py | 156 ++++++++++++++++++ 2 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 tests/python/topi/python/test_topi_group_conv2d_transpose.py diff --git a/python/tvm/topi/testing/conv2d_transpose_python.py b/python/tvm/topi/testing/conv2d_transpose_python.py index c7c0d9f2529a..5a0f1b235eb0 100644 --- a/python/tvm/topi/testing/conv2d_transpose_python.py +++ b/python/tvm/topi/testing/conv2d_transpose_python.py @@ -22,7 +22,7 @@ from tvm.topi.nn.utils import get_pad_tuple -def conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding): +def _conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding): """Transposed convolution operator in NCHW layout. Parameters @@ -141,3 +141,41 @@ def conv2d_transpose_nhwc_python( ) res_nhwc = np.transpose(res_nchw, (0, 2, 3, 1)) return res_nhwc + + +def conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding, groups=1): + """Convolution operator in NCHW layout. + + Parameters + ---------- + a_np : numpy.ndarray + 4-D with shape [batch, in_channel, in_height, in_width] + + w_np : numpy.ndarray + 4-D with shape [in_channel, num_filter // groups, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + output_padding : int or a list/tuple of two ints + Use to disambiguate the output shape. + + groups : int + Number of groups + + Returns + ------- + b_np : np.ndarray + 4-D with shape [batch, out_channel, out_height, out_width] + """ + a_slices = np.array_split(a_np, groups, axis=1) + w_slices = np.array_split(w_np, groups, axis=0) + b_slices = [ + _conv2d_transpose_nchw_python(a_slice, w_slice, stride, padding, output_padding) + for a_slice, w_slice in zip(a_slices, w_slices) + ] + b_np = np.concatenate(b_slices, axis=1) + return b_np diff --git a/tests/python/topi/python/test_topi_group_conv2d_transpose.py b/tests/python/topi/python/test_topi_group_conv2d_transpose.py new file mode 100644 index 000000000000..90b7500c6cd4 --- /dev/null +++ b/tests/python/topi/python/test_topi_group_conv2d_transpose.py @@ -0,0 +1,156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Example code to do group transpose convolution.""" + +import numpy as np +import tvm +import tvm.testing +import tvm.topi.testing +from tvm import te, topi +from tvm.contrib.pickle_memoize import memoize +from tvm.topi.utils import get_const_tuple + +_group_conv2d_nchw_implement = { + "generic": ( + topi.nn.group_conv2d_transpose_nchw, + topi.generic.schedule_group_conv2d_transpose_nchw, + ), +} + + +def verify_group_conv2d_transpose_nchw( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + output_padding, + groups, +): + print( + "Workload: (%d, %d, %s, %d, %s, %s, %s, %s, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding, output_padding, groups) + ) + + in_height, in_width = in_size + kernel_height, kernel_width = kernel + + A = te.placeholder((batch, in_channel, in_height, in_width), name="A") + W = te.placeholder((in_channel, num_filter // groups, kernel_height, kernel_width), name="W") + bias = te.placeholder((num_filter, 1, 1), name="bias") + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_group_conv2d_transpose.verify_group_conv2d_transpose_nchw") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np = tvm.topi.testing.conv2d_transpose_nchw_python( + a_np, w_np, stride, padding, output_padding, groups + ).astype(dtype) + + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) + return + + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nchw_implement) + C = fcompute(A, W, stride, padding, dtype, output_padding, groups) + s = fschedule([C]) + + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + func = tvm.build( + s, + [A, W, C], + target, + name="group_conv2d_transpose_%d_%d_%s_%d_%s_%s_%s_%s_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + output_padding, + groups, + ), + ) + func(a, w, c) + c = c.numpy() + for measurement, reference in zip(c, c_np): + tvm.testing.assert_allclose(measurement, reference, rtol=1e-5) + + for target in ["llvm"]: + check_target(target) + + +@tvm.testing.uses_gpu +def test_group_conv2d_transpose_nchw(): + verify_group_conv2d_transpose_nchw(1, 1, (224, 224), 1, (1, 1), (1, 1), (0, 0, 0, 0), (0, 0), 1) + verify_group_conv2d_transpose_nchw( + 1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0), 1 + ) + verify_group_conv2d_transpose_nchw( + 1, 3, (224, 224), 32, (3, 3), (3, 3), (0, 0, 0, 0), (0, 0), 1 + ) + verify_group_conv2d_transpose_nchw( + 1, 3, (224, 224), 32, (3, 3), (1, 1), (0, 0, 0, 0), (0, 0), 1 + ) + verify_group_conv2d_transpose_nchw( + 1, 3, (224, 224), 32, (3, 3), (2, 2), (1, 1, 1, 1), (0, 0), 1 + ) + verify_group_conv2d_transpose_nchw(1, 4, (32, 32), 4, (5, 5), (1, 1), (0, 0, 0, 0), (0, 0), 2) + verify_group_conv2d_transpose_nchw(1, 9, (32, 32), 9, (5, 5), (1, 1), (0, 0, 0, 0), (0, 0), 3) + 
verify_group_conv2d_transpose_nchw(1, 4, (32, 32), 16, (5, 5), (2, 2), (1, 1, 1, 1), (0, 0), 4) + verify_group_conv2d_transpose_nchw( + 1, 32, (8192, 1), 8, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0), 2 + ) + verify_group_conv2d_transpose_nchw( + 1, 512, (8, 1), 256, (31, 1), (2, 1), (14, 0, 15, 0), (0, 0), 16 + ) + verify_group_conv2d_transpose_nchw( + 1, 512, (8, 1), 256, (31, 1), (2, 1), (14, 0, 15, 0), (1, 0), 16 + ) + verify_group_conv2d_transpose_nchw( + 1, 64, (64, 64), 64, (4, 4), (1, 1), (0, 0, 0, 0), (0, 0), 64 + ) + verify_group_conv2d_transpose_nchw( + 1, 128, (32, 32), 128, (4, 4), (1, 1), (0, 0, 0, 0), (0, 0), 128 + ) + verify_group_conv2d_transpose_nchw( + 1, 256, (16, 16), 256, (4, 4), (1, 1), (0, 0, 0, 0), (0, 0), 256 + ) + + +if __name__ == "__main__": + test_group_conv2d_transpose_nchw() From 61db0d3f65b26f6b5466fff1a74ab242c02b80ea Mon Sep 17 00:00:00 2001 From: ligeng Date: Wed, 10 Nov 2021 19:57:00 +0000 Subject: [PATCH 19/27] fix lint --- src/relay/op/nn/convolution.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 70f480e8d8cf..c0cd6c5637a0 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1044,7 +1044,7 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (data == nullptr) return false; static const Layout kNCHW("NCHW"); - static const Layout kOIHW("OIHW"); // FIXME: weight layout should be IOHW as discussed in https://github.com/apache/tvm/pull/9336 + static const Layout kOIHW("OIHW"); // FIXME: weight layout should be IOHW as discussed in https://github.com/apache/tvm/pull/9336 const Conv2DTransposeAttrs* param = attrs.as(); ICHECK(param != nullptr); From 7af455c60dc621ef9bcd41b52b1f6d914593f8b2 Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 08:22:25 +0000 Subject: [PATCH 20/27] fix c++ lint --- src/relay/op/nn/convolution.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index c0cd6c5637a0..4eeab316c3c8 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1044,7 +1044,8 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (data == nullptr) return false; static const Layout kNCHW("NCHW"); - static const Layout kOIHW("OIHW"); // FIXME: weight layout should be IOHW as discussed in https://github.com/apache/tvm/pull/9336 + // FIXME: weight layout should be IOHW as discussed in https://github.com/apache/tvm/pull/9336 + static const Layout kOIHW("OIHW"); const Conv2DTransposeAttrs* param = attrs.as(); ICHECK(param != nullptr); From 6bc2ced4eab90a193da877b0f6addcbbc4fde226 Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 09:06:47 +0000 Subject: [PATCH 21/27] fix lint --- src/relay/op/nn/convolution.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 4eeab316c3c8..52958c2824b2 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1044,8 +1044,8 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (data == nullptr) return false; static const Layout kNCHW("NCHW"); - // FIXME: weight layout should be IOHW as discussed in https://github.com/apache/tvm/pull/9336 - static const Layout kOIHW("OIHW"); + // FIXME: weight layout should be IOHW. 
+ static const Layout kOIHW("OIHW"); const Conv2DTransposeAttrs* param = attrs.as(); ICHECK(param != nullptr); @@ -1110,10 +1110,9 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a << " out_channels=" << param->channels << " weight.shape=" << Array(wshape); } if (!dshape_nchw[1].as() && !wshape[0].as()) { - ICHECK(reporter->AssertEQ(dshape_nchw[1], wshape[0])) + ICHECK(reporter->AssertEQ(dshape_nchw[1], wshape[0])) << "Conv2DTransposed: shape of weight is inconsistent with in_channels." - << " data.shape= " << Array(dshape_nchw) - << " groups= " << param->groups + << " data.shape= " << Array(dshape_nchw) << " groups= " << param->groups << " weight.shape= " << Array(wshape); } channels = wshape[1]; From 3d7ce973a0f798f4213542fe61c9613971b30381 Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 11:53:08 +0000 Subject: [PATCH 22/27] fix python lint --- python/tvm/topi/nn/conv2d_transpose.py | 31 ++++++++++++++------------ 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index 8173b6c002ef..997ef6da5451 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -26,14 +26,17 @@ from .utils import get_pad_tuple from ..utils import simplify + def _ntuple(n): def parse(x): if isinstance(x, collections.abc.Iterable): assert len(x) == n, f"Input can only have {n} elements, but got {len(x)} instead: {x}." return x return tuple(repeat(x, n)) + return parse + _single = _ntuple(1) _pair = _ntuple(2) _triple = _ntuple(3) @@ -169,25 +172,28 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output """ if groups == 1: return conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output_padding) - - # some pre-processing and prelimnary checks + + # some pre-processing and prelimnary checks if out_dtype is None: out_dtype = data.dtype batch, in_channels, in_h, in_w = data.shape _, out_c, filter_h, filter_w = kernel.shape - assert in_channels % groups == 0, f"input channels {in_channels} must divide group size {groups}" + assert ( + in_channels % groups == 0 + ), f"input channels {in_channels} must divide group size {groups}" # assert out_c % groups == 0, f"output channels {in_c} must divide group size {groups}" strides = _pair(stride) # padding = _pair(padding) # output_padding = _pair(output_padding) # dilation = _pair(dilation) - + stride_h, stride_w = strides opad_h, opad_w = output_padding - assert opad_h < stride_h and opad_w < stride_w, \ - f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." + assert ( + opad_h < stride_h and opad_w < stride_w + ), f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." 
# dilate data data_dilate = dilate(data, [1, 1, stride_h, stride_w], name="data_dilate") # pad data @@ -224,16 +230,13 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output (batch, out_channels, out_h, out_w), lambda b, c, h, w: te.sum( data_pad[ - b, - c // (out_channels // groups) * (in_channels // groups) + dc, - h + dh, - w + dw + b, c // (out_channels // groups) * (in_channels // groups) + dc, h + dh, w + dw ].astype(out_dtype) * kernel_transform[ - c % (out_channels // groups), - c // (out_channels // groups) * (in_channels // groups) + dc, - dh, - dw + c % (out_channels // groups), + c // (out_channels // groups) * (in_channels // groups) + dc, + dh, + dw, ].astype(out_dtype), axis=[dc, dh, dw], ), From a30b5d5f8724bb5f1d2c6d3100447ad163674b90 Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 14:55:56 +0000 Subject: [PATCH 23/27] remove comments and reformat --- python/tvm/relay/op/strategy/generic.py | 8 +------- python/tvm/relay/op/strategy/x86.py | 14 ++------------ 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 1f7a7554b10e..e2c9dbea28cd 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -475,7 +475,6 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() if groups == 1: strategy.add_implementation( @@ -1189,12 +1188,7 @@ def _compute_nms(attrs, inputs, out_type): score_threshold = inputs[4] output_format = attrs.output_format return topi_compute( - inputs[0], - inputs[1], - max_output_size, - iou_threshold, - score_threshold, - output_format, + inputs[0], inputs[1], max_output_size, iou_threshold, score_threshold, output_format, ) return _compute_nms diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index b25cf35b2000..6e86982b67ae 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -281,13 +281,7 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): groups = attrs.groups assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" - # assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - # strategy.add_implementation( - # wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), - # wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), - # name="conv2d_transpose_nchw.x86", - # ) if groups == 1: strategy.add_implementation( wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), @@ -331,9 +325,7 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): # or packed layouts. if layout == "NCDHW": strategy.add_implementation( - wrap_compute_conv3d(topi.nn.conv3d_ncdhw), - naive_schedule, - name="conv3d_ncdhw.x86", + wrap_compute_conv3d(topi.nn.conv3d_ncdhw), naive_schedule, name="conv3d_ncdhw.x86", ) elif layout == "NDHWC": strategy.add_implementation( @@ -452,9 +444,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): "Recommend to use cblas/mkl/mkldnn for better performance." 
) strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.generic", + wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.generic", ) return strategy From 56fa3bcb71fc7618e3b8b5f00e6a733e81a8ccdd Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 15:09:18 +0000 Subject: [PATCH 24/27] lint file --- python/tvm/relay/op/strategy/x86.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 6e86982b67ae..a421b120fab4 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -325,7 +325,9 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): # or packed layouts. if layout == "NCDHW": strategy.add_implementation( - wrap_compute_conv3d(topi.nn.conv3d_ncdhw), naive_schedule, name="conv3d_ncdhw.x86", + wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + naive_schedule, + name="conv3d_ncdhw.x86", ) elif layout == "NDHWC": strategy.add_implementation( @@ -444,7 +446,9 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): "Recommend to use cblas/mkl/mkldnn for better performance." ) strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.generic", + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.generic", ) return strategy From 0a1a437e1c50afd9186f4e94752094b433013c5a Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 15:14:37 +0000 Subject: [PATCH 25/27] lint code --- python/tvm/relay/op/strategy/generic.py | 7 ++++++- python/tvm/topi/nn/conv2d_transpose.py | 16 ++++++++-------- .../tvm/topi/testing/conv2d_transpose_python.py | 14 +++++++------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index e2c9dbea28cd..ab12be16e17e 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1188,7 +1188,12 @@ def _compute_nms(attrs, inputs, out_type): score_threshold = inputs[4] output_format = attrs.output_format return topi_compute( - inputs[0], inputs[1], max_output_size, iou_threshold, score_threshold, output_format, + inputs[0], + inputs[1], + max_output_size, + iou_threshold, + score_threshold, + output_format, ) return _compute_nms diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index 997ef6da5451..3f3723f43592 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -141,30 +141,30 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output ---------- data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - + kernel : tvm.te.Tensor 4-D with shape [in_channel, out_channel // groups, filter_height, filter_width] - + stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] - + padding : int or a list/tuple of 2 or 4 ints padding size, or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - + out_dtype : str The output data type. This is used for mixed precision. - + output_padding : tuple of ints Used to get the right output shape for gradients - + groups : int number of groups - + out_dtype : str The output type. This is used for mixed precision. 
- + Returns ------- Output : tvm.te.Tensor diff --git a/python/tvm/topi/testing/conv2d_transpose_python.py b/python/tvm/topi/testing/conv2d_transpose_python.py index 5a0f1b235eb0..a38d8bc9f031 100644 --- a/python/tvm/topi/testing/conv2d_transpose_python.py +++ b/python/tvm/topi/testing/conv2d_transpose_python.py @@ -145,27 +145,27 @@ def conv2d_transpose_nhwc_python( def conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding, groups=1): """Convolution operator in NCHW layout. - + Parameters ---------- a_np : numpy.ndarray 4-D with shape [batch, in_channel, in_height, in_width] - + w_np : numpy.ndarray 4-D with shape [in_channel, num_filter // groups, filter_height, filter_width] - + stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] - + padding : int or str Padding size, or ['VALID', 'SAME'] - + output_padding : int or a list/tuple of two ints Use to disambiguate the output shape. - + groups : int Number of groups - + Returns ------- b_np : np.ndarray From 562fdabad786be751bd8f471fde9167e10d6a2eb Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 15:33:56 +0000 Subject: [PATCH 26/27] fix lint --- python/tvm/topi/nn/conv2d_transpose.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index 3f3723f43592..e3c6c94eb290 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -193,7 +193,8 @@ def group_conv2d_transpose_nchw(data, kernel, stride, padding, out_dtype, output opad_h, opad_w = output_padding assert ( opad_h < stride_h and opad_w < stride_w - ), f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." + ), f"[{output_padding}] opad_h:{opad_h} < stride_h:{stride_h} \ + and opad_w:{opad_w} < stride_w:{stride_w} does not satisfy." # dilate data data_dilate = dilate(data, [1, 1, stride_h, stride_w], name="data_dilate") # pad data From 3e0db602f4be6bf6ab8f4b7329c1dfbc519fb9e7 Mon Sep 17 00:00:00 2001 From: ligeng Date: Thu, 11 Nov 2021 16:36:42 +0000 Subject: [PATCH 27/27] update logging information in convolution.h --- src/relay/op/nn/convolution.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 52958c2824b2..5c67a047487f 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -1107,7 +1107,8 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a ICHECK(reporter->AssertEQ(indexdiv(param->channels, param->groups), wshape[1])) << "Conv2DTransposed: shape of weight is inconsistent with out_channels, " << " out_channels // groups != weight.shape[1] " - << " out_channels=" << param->channels << " weight.shape=" << Array(wshape); + << " out_channels=" << param->channels << " groups=" << param->groups + << " weight.shape=" << Array(wshape); } if (!dshape_nchw[1].as() && !wshape[0].as()) { ICHECK(reporter->AssertEQ(dshape_nchw[1], wshape[0]))
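
Note on the semantics exercised by the grouped test and compute above: a grouped transposed convolution is equivalent to splitting the input channels (and the leading in_channel axis of the weight) into groups equal blocks, running an ordinary transposed convolution on each block independently, and concatenating the per-group results along the output-channel axis, which is exactly what the c // (out_channels // groups) * (in_channels // groups) indexing in the topi compute expresses. The sketch below illustrates that equivalence in plain NumPy. It is not part of these patches or of the TVM API: the function names are made up for illustration, padding is simplified to a symmetric [pad_h, pad_w] pair (the topi op and its python reference also accept 4-tuple padding), and the weight is assumed to be in the [in_channel, out_channel // groups, kernel_h, kernel_w] layout used by conv2d_transpose.

import numpy as np


def conv2d_transpose_nchw_ref(a, w, stride, padding, output_padding):
    # Single-group transposed convolution, scatter formulation.
    # a: [batch, in_c, in_h, in_w]; w: [in_c, out_c, k_h, k_w] (IOHW).
    # Symmetric [pad_h, pad_w] padding is assumed for brevity.
    batch, in_c, in_h, in_w = a.shape
    _, out_c, k_h, k_w = w.shape
    s_h, s_w = stride
    p_h, p_w = padding
    opad_h, opad_w = output_padding
    out_h = (in_h - 1) * s_h - 2 * p_h + k_h + opad_h
    out_w = (in_w - 1) * s_w - 2 * p_w + k_w + opad_w
    out = np.zeros((batch, out_c, out_h, out_w), dtype=a.dtype)
    for n in range(batch):
        for ci in range(in_c):
            for co in range(out_c):
                for y in range(in_h):
                    for x in range(in_w):
                        for ky in range(k_h):
                            for kx in range(k_w):
                                # Each input pixel scatters a kernel-sized patch into the
                                # output; positions that fall into the cropped padding
                                # region are dropped by the bounds check.
                                oy = y * s_h + ky - p_h
                                ox = x * s_w + kx - p_w
                                if 0 <= oy < out_h and 0 <= ox < out_w:
                                    out[n, co, oy, ox] += a[n, ci, y, x] * w[ci, co, ky, kx]
    return out


def group_conv2d_transpose_nchw_ref(a, w, stride, padding, output_padding, groups):
    # Grouped transposed convolution as `groups` independent transposed convolutions:
    # slice the input channels and the leading weight axis into equal blocks and
    # concatenate the per-group outputs along the channel axis.
    in_c = a.shape[1]
    assert in_c % groups == 0, "in_channels must be divisible by groups"
    ic_per_group = in_c // groups
    outs = []
    for g in range(groups):
        sl = slice(g * ic_per_group, (g + 1) * ic_per_group)
        outs.append(conv2d_transpose_nchw_ref(a[:, sl], w[sl], stride, padding, output_padding))
    return np.concatenate(outs, axis=1)


For example, with a of shape (1, 4, 32, 32), w of shape (4, 4, 5, 5), stride (2, 2), padding (1, 1), output_padding (1, 1) and groups=2, the result has shape (1, 8, 66, 66), since out_h = (32 - 1) * 2 - 2 * 1 + 5 + 1 = 66 and the 2 groups each contribute 4 output channels. That is the same per-group decomposition the verify_group_conv2d_transpose_nchw test above relies on when it compares the topi compute against the python reference.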