From bfbcf42e2452919539a8f6d26f9a6d52053a82d9 Mon Sep 17 00:00:00 2001
From: wangyucheng
Date: Wed, 9 Jun 2021 20:08:10 +0800
Subject: [PATCH 1/4] add conv2d leg

---
 python/tvm/topi/cuda/conv2d_alter_op.py      | 153 +++++++++++++++---
 python/tvm/topi/cuda/tensorcore_alter_op.py  |  10 +-
 .../relay/test_pass_legalize_tensorcore.py   | 127 ++++++++++++---
 3 files changed, 246 insertions(+), 44 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 067f27262b06..331cc0aad403 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -270,6 +270,60 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     return None
 
 
+def _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor):
+    # Pad batch size
+    if db != 0:
+        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, db), (0, 0)))
+
+    # Pad input channel
+    if di != 0:
+        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+
+    # Pad output channel
+    if do != 0:
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, do), (0, 0)))
+
+    if do != 0:
+        new_out_channel = out_channel + do
+        new_attrs["channels"] = new_out_channel
+
+    out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+    if db != 0 or do != 0:
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
+
+    return out
+
+
+def _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor):
+    # Pad batch size
+    if db != 0:
+        data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
+
+    # Pad input channel
+    if di != 0:
+        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
+
+    # Pad output channel
+    if do != 0:
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
+
+    if do != 0:
+        new_out_channel = out_channel + do
+        new_attrs["channels"] = new_out_channel
+
+    out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+    if db != 0 or do != 0:
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
+
+    return out
+
+
 @conv2d_legalize.register("cuda")
 def _conv2d_legalize(attrs, inputs, arg_types):
     """Legalizes Conv2D op.
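Note: the two helpers above differ only in which pad_width slots carry the batch (db), input-channel (di), and output-channel (do) padding; the conv2d call, the channels update, and the strided_slice that restores the original output shape are identical. Read off the pad_width tuples, the implied axis mapping is:

# Axis mapping implied by the pad_width tuples in the two helpers:
#   NHWC data   (N, H, W, C): db -> axis 0, di -> axis 3
#   HWIO kernel (H, W, I, O): di -> axis 2, do -> axis 3
#   HWNC data   (H, W, N, C): db -> axis 2, di -> axis 3
#   HWOI kernel (H, W, O, I): do -> axis 2, di -> axis 3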
@@ -347,7 +401,47 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
-    elif data_dtype in ["float16"]:  # todo: support int8/int4
+        elif data_layout == "NHWC" and kernel_layout == "HWIO":
+            batch = data_tensor.shape[0].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[3].value
+
+            if (
+                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
+                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
+                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
+            ):
+                # no need to pad
+                return None
+
+            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+        elif data_layout == "HWNC" and kernel_layout == "HWOI":
+            batch = data_tensor.shape[2].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[2].value
+
+            if batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0:
+                return None
+
+            candidates = [(8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+    elif data_dtype in ["float16"]:
         if data_layout == "NHWC" and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
             in_channel = data_tensor.shape[3].value
@@ -361,7 +455,8 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 # no need to pad
                 return None
 
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel)
+            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
 
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -369,28 +464,46 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 return None
 
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
-            # Pad batch size
-            if db != 0:
-                data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
-
-            # Pad input channel
-            if di != 0:
-                data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
-                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
-
-            # Pad output channel
-            if do != 0:
-                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
-
-            if do != 0:
-                new_out_channel = out_channel + do
-                new_attrs["channels"] = new_out_channel
-
-            out = relay.nn.conv2d(data, kernel, **new_attrs)
-
-            if db != 0 or do != 0:
-                original_out_shape = [x.value for x in output_tensor.shape]
-                out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
-
-            return out
+            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+    elif data_dtype in ["int4", "uint4"]:
+        if data_layout == "NHWC" and kernel_layout == "HWIO":
+            batch = data_tensor.shape[0].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[3].value
+
+            if (
+                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
+                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
+                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
+            ):
+                # no need to pad
+                return None
+
+            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+        elif data_layout == "HWNC" and kernel_layout == "HWOI":
+            batch = data_tensor.shape[2].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[2].value
+
+            if batch % 8 == 0 and in_channel % 32 == 0 and out_channel % 8 == 0:
+                return None
+
+            candidates = [(8, 32, 8)]
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
 
     return None
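All of the new branches share one recipe: return None when the shape already fits a tensorcore tile, otherwise ask pad_to_tensorcore for the cheapest (db, di, do), skip the rewrite when the extra_flops ratio exceeds 2, and otherwise pad, convolve, and slice the padding back off. A minimal sketch of the Relay graph the NHWC path should produce for one of the new test cases (shapes taken from the tests in this series; variable names are illustrative):

import tvm
from tvm import relay

# data (7, 16, 16, 64) x kernel (3, 3, 64, 64): batch 7 is padded up to 8,
# matching the (8, 16, 32) candidate, then the output is sliced back.
x = relay.var("x", shape=(7, 16, 16, 64), dtype="int8")
w = relay.var("w", shape=(3, 3, 64, 64), dtype="int8")
x_pad = relay.nn.pad(x, pad_width=((0, 1), (0, 0), (0, 0), (0, 0)))  # pad batch axis
y = relay.nn.conv2d(
    x_pad, w, channels=64, kernel_size=(3, 3), padding=(1, 1),
    data_layout="NHWC", kernel_layout="HWIO",
)
y = relay.strided_slice(y, begin=[0, 0, 0, 0], end=[7, 16, 16, 64])  # drop the pad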
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index aec7acbfde56..eb7c71ddf1c9 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -71,7 +71,8 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
             # no need to pad
             return None
 
-        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
+        candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
 
         if extra_flops > 2:
             logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -145,7 +146,8 @@ def _dense_legalize(attrs, inputs, arg_types):
             # no need to pad
             return None
 
-        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N)
+        candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates)
 
         if extra_flops_ratio > 2:
             logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio)
@@ -171,10 +173,8 @@ def _dense_legalize(attrs, inputs, arg_types):
     return None
 
 
-def pad_to_tensorcore(M, K, N):
+def pad_to_tensorcore(M, K, N, candidates):
     """pad shape to enable tensorcore"""
-    candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-
     flops = M * K * N
     extra_flops = math.inf
     best_pad = (0, 0, 0)
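pad_to_tensorcore now takes the candidate tile shapes from the caller instead of hard-coding them, which is what lets the int4 path pass (8, 32, 8) and the int8 HWNC path pass (8, 16, 32). The hunk above only shows the start of the function body; a sketch of the selection it feeds (hypothetical helper name, assuming the existing loop minimizes the relative flops cost of padding, consistent with the "extra_flops > 2" guards in the callers):

import math

def _pick_padding(M, K, N, candidates):
    # For each candidate tile (tm, tk, tn), round M/K/N up to multiples of
    # the tile and keep the padding with the smallest relative flops cost.
    flops = M * K * N
    best_pad, extra_flops = (0, 0, 0), math.inf
    for tm, tk, tn in candidates:
        dm, dk, dn = -M % tm, -K % tk, -N % tn
        cost = ((M + dm) * (K + dk) * (N + dn) - flops) / flops
        if cost < extra_flops:
            best_pad, extra_flops = (dm, dk, dn), cost
    return best_pad, extra_flops

# e.g. _pick_padding(7, 64, 64, [(16, 16, 16), (32, 16, 8), (8, 16, 32)])
# -> ((1, 0, 0), 1/7): padding batch 7 to 8 adds ~14% flops, well under
#    the > 2 cutoff used by the legalizations.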
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index f45e39047238..57d5865c3b33 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -36,18 +36,18 @@ def run_opt_pass(expr, passes):
 
 
 @tvm.testing.uses_gpu
-def test_legalize_conv2d():
-    """test legalize conv2d to enable tensorcore"""
+def test_legalize_conv2d_NHWC():
+    """test legalize NHWC conv2d to enable tensorcore"""
 
-    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
+    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, dtype, do_pad=True):
         out_channel = kernel_shape[3]
         out_shape = list(data_shape)
         out_shape[3] = out_channel
        db, di, do = pad_shape
 
         def before():
-            x = relay.var("x", shape=data_shape, dtype="float16")
-            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            x = relay.var("x", shape=data_shape, dtype=dtype)
+            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
             y = relay.nn.conv2d(
                 x,
                 weight,
@@ -67,12 +67,12 @@ def legalize_conv2d(attrs, inputs, types):
         def expected():
             if not do_pad:
                 return before()
-            x = relay.var("x", shape=data_shape, dtype="float16")
+            x = relay.var("x", shape=data_shape, dtype=dtype)
             if db or di:
                 x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
             else:
                 x_pad = x
-            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
             if di or do:
                 weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
             else:
@@ -99,20 +99,108 @@ def expected():
         b = run_opt_pass(expected(), transform.InferType())
         assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
 
+    for dtype in ["float16", "int8", "int4"]:
+        # conv2d pad batch
+        _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0), dtype)
+        _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0), dtype)
+        _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), dtype, False)
+        # conv2d pad in_channel
+        _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0), dtype)
+        _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0), dtype)
+        _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0), dtype)
+        _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), dtype, False)
+        # conv2d pad out_channel
+        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1), dtype)
+        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31), dtype)
+        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), dtype, False)
+
+@tvm.testing.uses_gpu
+def test_legalize_conv2d_HWNC():
+    """test legalize HWNC conv2d to enable tensorcore"""
+
+    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, dtype, do_pad=True):
+        out_channel = kernel_shape[2]
+        out_shape = list(data_shape)
+        out_shape[3] = out_channel
+        db, di, do = pad_shape
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype=dtype)
+            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
+            y = relay.nn.conv2d(
+                x,
+                weight,
+                channels=out_channel,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="HWNC",
+                kernel_layout="HWOI",
+            )
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_conv2d(attrs, inputs, types):
+            with tvm.target.Target("cuda"):
+                return topi.nn.conv2d_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()
+            x = relay.var("x", shape=data_shape, dtype=dtype)
+            if db or di:
+                x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, 0), (0, db), (0, di)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
+            if di or do:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, do), (0, di)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.conv2d(
+                x_pad,
+                weight=weight_pad,
+                channels=out_channel + do,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="HWNC",
+                kernel_layout="HWOI",
+            )
+            if db or do:
+                y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+            assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    # data_layout="HWNC",kernel_layout="HWOI"
     # conv2d pad batch
-    _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
-    _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
-    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
+    _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int8")
+    _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int8")
+    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), "int8", False)
+    _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int4")
+    _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int4")
+    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), "int4", False)
     # conv2d pad in_channel
-    _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
-    _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
-    _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
-    _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
+    _test_legalize_conv2d((16, 16, 8, 63), (3, 3, 64, 63), (0, 1, 0), "int8")
+    _test_legalize_conv2d((16, 16, 8, 33), (3, 3, 64, 33), (0, 15, 0), "int8")
+    _test_legalize_conv2d((16, 16, 8, 13), (3, 3, 64, 13), (0, 3, 0), "int8")
+    _test_legalize_conv2d((16, 16, 8, 1), (3, 3, 64, 1), (0, 0, 0), "int8", False)
+    _test_legalize_conv2d((16, 16, 8, 63), (3, 3, 64, 63), (0, 1, 0), "int4")
+    _test_legalize_conv2d((16, 16, 8, 33), (3, 3, 64, 33), (0, 31, 0), "int4")
+    _test_legalize_conv2d((16, 16, 8, 13), (3, 3, 64, 13), (0, 19, 0), "int4")
+    _test_legalize_conv2d((16, 16, 8, 1), (3, 3, 64, 1), (0, 0, 0), "int4", False)
     # conv2d pad out_channel
-    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
-    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
-    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)
-
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 63, 64), (0, 0, 1), "int8")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 31), "int8")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int8", False)
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 63, 64), (0, 0, 1), "int4")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 7), "int4")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int4", False)
 
 @tvm.testing.uses_gpu
 def test_legalize_dense():
     def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
@@ -234,6 +322,7 @@ def expected():
 
 
 if __name__ == "__main__":
-    test_legalize_conv2d()
+    test_legalize_conv2d_NHWC()
+    test_legalize_conv2d_HWNC()
     test_legalize_dense()
     test_legalize_batch_matmul()
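The pad shapes the tests expect follow directly from the candidate tiles: each dimension is rounded up to the next multiple of the matching tile axis (16 for in_channel under float16/int8, 32 under the int4 HWNC candidate (8, 32, 8)). A few spot checks, using -c % t as the round-up delta:

# Round-up deltas behind the expected pad shapes in the tests above:
assert -7 % 8 == 1     # batch 7 -> 8:        pad_shape (1, 0, 0)
assert -33 % 16 == 15  # in_channel 33 -> 48:  (0, 15, 0) for float16/int8
assert -33 % 32 == 31  # in_channel 33 -> 64:  (0, 31, 0) for int4 HWNC
assert -13 % 32 == 19  # in_channel 13 -> 32:  (0, 19, 0) for int4 HWNC
assert -33 % 8 == 7    # out_channel 33 -> 40: (0, 0, 7) for int4 HWNC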
From b045e365483f87097a6b9478ca0fce747837c578 Mon Sep 17 00:00:00 2001
From: wangyucheng
Date: Wed, 9 Jun 2021 20:11:35 +0800
Subject: [PATCH 2/4] minor fix

---
 tests/python/relay/test_pass_legalize_tensorcore.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index 57d5865c3b33..e91e48c0c451 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -114,6 +114,7 @@ def expected():
         _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31), dtype)
         _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), dtype, False)
 
+
 @tvm.testing.uses_gpu
 def test_legalize_conv2d_HWNC():
     """test legalize HWNC conv2d to enable tensorcore"""
@@ -177,7 +178,6 @@ def expected():
         a = run_opt_pass(a, transform.Legalize())
         b = run_opt_pass(expected(), transform.InferType())
         assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
-    # data_layout="HWNC",kernel_layout="HWOI"
     # conv2d pad batch
     _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int8")
     _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int8")
@@ -202,6 +202,7 @@ def expected():
     _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 7), "int4")
     _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int4", False)
 
+
 @tvm.testing.uses_gpu
 def test_legalize_dense():
     def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
From ced4c69fbfc438d053fa396983f721645fd08d85 Mon Sep 17 00:00:00 2001
From: wangyucheng
Date: Wed, 9 Jun 2021 20:25:57 +0800
Subject: [PATCH 3/4] fix pylint

---
 python/tvm/topi/cuda/conv2d_alter_op.py      | 20 ++++++++++++++-----
 .../relay/test_pass_legalize_tensorcore.py   |  1 +
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 331cc0aad403..dc0dc1ade646 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -415,7 +415,9 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 return None
 
             candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
 
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -433,7 +435,9 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 return None
 
             candidates = [(8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
 
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -456,7 +460,9 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 return None
 
             candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
 
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -480,7 +486,9 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 return None
 
             candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
 
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -498,7 +506,9 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 return None
 
             candidates = [(8, 32, 8)]
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel, candidates)
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
 
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index e91e48c0c451..1312b396fe4c 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -178,6 +178,7 @@ def expected():
         a = run_opt_pass(a, transform.Legalize())
         b = run_opt_pass(expected(), transform.InferType())
        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
     # conv2d pad batch
     _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int8")
     _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int8")
From 0baf0d3e57c5104b3520a7cd01e9f002861c49c7 Mon Sep 17 00:00:00 2001
From: wangyucheng
Date: Wed, 9 Jun 2021 20:34:07 +0800
Subject: [PATCH 4/4] fix pylint

---
 python/tvm/topi/cuda/conv2d_alter_op.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index dc0dc1ade646..4863a06b728d 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -401,7 +401,8 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
-        elif data_layout == "NHWC" and kernel_layout == "HWIO":
+
+        if data_layout == "NHWC" and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
             in_channel = data_tensor.shape[3].value
             out_channel = kernel_tensor.shape[3].value
@@ -426,7 +427,8 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
             return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-        elif data_layout == "HWNC" and kernel_layout == "HWOI":
+
+        if data_layout == "HWNC" and kernel_layout == "HWOI":
             batch = data_tensor.shape[2].value
             in_channel = data_tensor.shape[3].value
             out_channel = kernel_tensor.shape[2].value
@@ -445,6 +447,7 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
             return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+
     elif data_dtype in ["float16"]:
         if data_layout == "NHWC" and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
@@ -471,6 +474,7 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
             return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+
     elif data_dtype in ["int4", "uint4"]:
         if data_layout == "NHWC" and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
@@ -497,7 +501,8 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
             return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-        elif data_layout == "HWNC" and kernel_layout == "HWOI":
+
+        if data_layout == "HWNC" and kernel_layout == "HWOI":
             batch = data_tensor.shape[2].value
             in_channel = data_tensor.shape[3].value
             out_channel = kernel_tensor.shape[2].value
@@ -516,4 +521,5 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
             return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+
     return None
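For reference, a minimal driver in the same pattern as the tests, showing how the legalization is triggered outside the test suite (a sketch only; the int8 NHWC workload and variable names are illustrative):

import tvm
from tvm import relay, topi
from tvm.relay import transform
from tvm.relay.testing.temp_op_attr import TempOpAttr

# Route nn.conv2d legalization to the CUDA implementation patched above.
def legalize_conv2d(attrs, inputs, types):
    with tvm.target.Target("cuda"):
        return topi.nn.conv2d_legalize(attrs, inputs, types)

x = relay.var("x", shape=(7, 16, 16, 64), dtype="int8")
w = relay.var("w", shape=(3, 3, 64, 64), dtype="int8")
y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1),
                    data_layout="NHWC", kernel_layout="HWIO")
f = relay.Function([x, w], y)

with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
    mod = tvm.IRModule.from_expr(f)
    # After this pass the conv2d is wrapped in pad/strided_slice as above.
    mod = transform.Legalize()(mod)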