From eb917d8b122c985b794581f8586e307b4578157a Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 1 Mar 2022 16:56:51 -0800 Subject: [PATCH 01/10] implement matmulinteger --- python/tvm/relay/frontend/onnx.py | 59 +++++++++++++++++++++- tests/python/frontend/onnx/test_forward.py | 2 +- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 474b688e2ad8..4ff5b034194f 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -3916,7 +3916,7 @@ class QLinearMatMul(OnnxOpConverter): """ @classmethod - def _impl_v10(cls, inputs, attr, params): + def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=["int8", "uint8"]): # Some of the ops used below take scalar-like inputs, and may require either # of the following: @@ -3966,7 +3966,7 @@ def try_resolve_to_const(x, dtype_override=None): assert b_zp_type.dtype == b_type.dtype assert y_scale_type.dtype == "float32" - assert y_zp_type.dtype in ["int8", "uint8"] + assert y_zp_type.dtype in expected_out_dtypes # TODO: relax this limitation in a future version of this importer. a_rank = len(a_shape) @@ -4053,6 +4053,60 @@ def try_resolve_to_const(x, dtype_override=None): return y +class MatMulInteger(OnnxOpConverter): + """Operator converter for MatMulInteger.""" + + @classmethod + def _impl_v10(cls, inputs, attr, params): + # The production MUST never overflow. The accumulation may overflow if and only if in 32 bits + a = inputs[0] + b = inputs[1] + + a_dtype = infer_type(a).checked_type.dtype + b_dtype = infer_type(b).checked_type.dtype + + assert a_dtype in ("int8", "uint8"), "MatMulInteger: invalid dtype for first input" + assert b_dtype in ("int8", "uint8"), "MatMulInteger: invalid dtype for second input" + + assert a_dtype == b_dtype, "MatMulInteger: input dtypes must match" + + a_scale = _op.const(1.0, dtype="float32") + b_scale = _op.const(1.0, dtype="float32") + out_scale = _op.const(1.0, dtype="float32") + + a_zero_point = _op.const(0.0, dtype=a_dtype) + b_zero_point = _op.const(0.0, dtype=b_dtype) + # We use a_dtype here because a_dtype and b_dtype are equivalent + out_zero_point = _op.const(0.0, dtype="int32") + + if len(inputs) == 4: + a_zero_point = inputs[2] + b_zero_point = inputs[3] + + a_zp_dtype = infer_type(a_zero_point).checked_type.dtype + b_zp_dtype = infer_type(b_zero_point).checked_type.dtype + assert ( + a_zp_dtype == a_dtype and b_zp_dtype == b_dtype + ), "MatMulInteger: input dtype doesn't match zero point dtype" + elif len(inputs) != 2: + raise AssertionError( + "MatMulInteger op takes 2 or 4 inputs, {} given".format(len(inputs)) + ) + + inputs = [ + a, + a_scale, + a_zero_point, + b, + b_scale, + b_zero_point, + out_scale, + out_zero_point, + ] + + return QLinearMatMul.get_converter(10)(inputs, attr, params, expected_out_dtypes=["int32"]) + + class QLinearMul(OnnxOpConverter): """Operator converter for QLinearMul from Microsoft onnxruntime contrib opset.""" @@ -4781,6 +4835,7 @@ def _get_convert_map(opset): "Softsign": Softsign.get_converter(opset), "Gemm": Gemm.get_converter(opset), "MatMul": MatMul.get_converter(opset), + "MatMulInteger": MatMulInteger.get_converter(opset), "MatMulInteger16": MatMulInteger16.get_converter(opset), "Mod": Mod.get_converter(opset), "Xor": Renamer("logical_xor"), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 0751f4a2e293..7a5b5d190a3a 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ 
b/tests/python/frontend/onnx/test_forward.py @@ -5053,7 +5053,7 @@ def verify_eyelike(indata): "test_loop11", "test_loop13_seq", "test_lstm_batchwise", - "test_matmulinteger", + # "test_matmulinteger", "test_maxpool_with_argmax_2d_precomputed_pads", "test_maxpool_with_argmax_2d_precomputed_strides", "test_maxunpool_export_with_output_shape", From 8cde209cfa5b2d647fc4c18346bced7868f29386 Mon Sep 17 00:00:00 2001 From: An Wang Date: Wed, 2 Mar 2022 08:43:34 -0800 Subject: [PATCH 02/10] rm test --- tests/python/frontend/onnx/test_forward.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 7a5b5d190a3a..94fd0a5de40b 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5053,7 +5053,6 @@ def verify_eyelike(indata): "test_loop11", "test_loop13_seq", "test_lstm_batchwise", - # "test_matmulinteger", "test_maxpool_with_argmax_2d_precomputed_pads", "test_maxpool_with_argmax_2d_precomputed_strides", "test_maxunpool_export_with_output_shape", From 696399f62868c570d51b7c39ca06e83ab8602619 Mon Sep 17 00:00:00 2001 From: An Wang Date: Wed, 2 Mar 2022 08:46:56 -0800 Subject: [PATCH 03/10] rm outdated comments --- python/tvm/relay/frontend/onnx.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 4ff5b034194f..f70d3522bfa1 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -4058,7 +4058,6 @@ class MatMulInteger(OnnxOpConverter): @classmethod def _impl_v10(cls, inputs, attr, params): - # The production MUST never overflow. The accumulation may overflow if and only if in 32 bits a = inputs[0] b = inputs[1] @@ -4076,7 +4075,6 @@ def _impl_v10(cls, inputs, attr, params): a_zero_point = _op.const(0.0, dtype=a_dtype) b_zero_point = _op.const(0.0, dtype=b_dtype) - # We use a_dtype here because a_dtype and b_dtype are equivalent out_zero_point = _op.const(0.0, dtype="int32") if len(inputs) == 4: From 7f4588d1b38106eba1181dba74d24f24ccdb8017 Mon Sep 17 00:00:00 2001 From: An Wang Date: Thu, 3 Mar 2022 00:29:46 -0800 Subject: [PATCH 04/10] fix lint and review --- python/tvm/relay/frontend/onnx.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index f70d3522bfa1..ab0eeb091043 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -3913,10 +3913,17 @@ class QLinearMatMul(OnnxOpConverter): - Only supports 2D input tensors. - Not guaranteed to meet the integer-overflow behavior stipulated in the ONNX documentation for this operator. + + The QLinearMatMul converter is re-used for MatMulInteger and is adapted for + the latter with the optional `expected_out_dtypes` argument. """ @classmethod - def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=["int8", "uint8"]): + def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=None): + if expected_out_dtypes is None: + # The default QLinearMatMul converter is expected to have one of + # these output dtypes. 
+ expected_out_dtypes = ["int8", "uint8"] # Some of the ops used below take scalar-like inputs, and may require either # of the following: @@ -4028,6 +4035,11 @@ def try_resolve_to_const(x, dtype_override=None): matmul_result_scale_scalar = fold_constant(_op.multiply(a_scale_scalar, b_scale_scalar)) matmul_result_zp_scalar = _op.const(0, dtype="int32") + if "int32" in expected_out_dtypes: + # This is the adaptation of the QLinearMatMul converter for MatMulInteger, + # in the MatMulInteger case we skip the unnecessary requantization step. + return matmul_result + # requantize requires y_scale to be constant, # if y_scale is not constant, doing dequantize -> quantize if isinstance(y_scale_scalar, _expr.Constant): From f80198cbcea46e685f8bc56b25f847fe4eee5570 Mon Sep 17 00:00:00 2001 From: An Wang Date: Fri, 4 Mar 2022 12:38:51 -0800 Subject: [PATCH 05/10] wip --- tests/python/frontend/onnx/test_forward.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 94fd0a5de40b..b15b87550615 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5150,6 +5150,9 @@ def verify_eyelike(indata): @pytest.mark.parametrize("onnx_test", onnx_test_folders) @tvm.testing.parametrize_targets def test_onnx_nodes(target, dev, onnx_test): + # breakpoint() + target = "cuda" + dev = tvm.cuda(0) target_kind = tvm.target.Target(target).kind.name if onnx_test in unsupported_onnx_tests: @@ -5189,6 +5192,8 @@ def test_onnx_nodes(target, dev, onnx_test): else: raise ImportError(str(tensor) + " not labeled as an import or an output") + breakpoint() + print(inputs) tvm_val = get_tvm_output_with_vm(onnx_model, inputs, target, dev) if len(outputs) == 1: tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=rtol, atol=atol) From 132c31464f9ff9846b72d2dca0f9ba4b883429ea Mon Sep 17 00:00:00 2001 From: An Wang Date: Mon, 28 Mar 2022 15:16:03 -0700 Subject: [PATCH 06/10] fixes --- python/tvm/topi/cuda/tensorcore_alter_op.py | 6 ++++-- tests/python/frontend/onnx/test_forward.py | 3 --- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index 080ddf28b7c2..d1d27587d128 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -148,16 +148,18 @@ def _dense_legalize(attrs, inputs, arg_types): # Pad input and output channels to use tensorcore schedule. 
if dtype in ["float16", "int8", "uint8"]: - # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) + # The shape of (M, K, N) must be multiple of + # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) or (4, 4, 4) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + or (M % 4 == 0 and K % 4 == 0 and N % 4 == 0) ): # no need to pad return None - candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] + candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32), (4, 4, 4)] elif dtype in ["int4", "uint4"]: if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: # no need to pad diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index b15b87550615..bce89fb38a21 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5150,7 +5150,6 @@ def verify_eyelike(indata): @pytest.mark.parametrize("onnx_test", onnx_test_folders) @tvm.testing.parametrize_targets def test_onnx_nodes(target, dev, onnx_test): - # breakpoint() target = "cuda" dev = tvm.cuda(0) target_kind = tvm.target.Target(target).kind.name @@ -5192,8 +5191,6 @@ def test_onnx_nodes(target, dev, onnx_test): else: raise ImportError(str(tensor) + " not labeled as an import or an output") - breakpoint() - print(inputs) tvm_val = get_tvm_output_with_vm(onnx_model, inputs, target, dev) if len(outputs) == 1: tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=rtol, atol=atol) From 36c2690b6e1852b667775f4d67d328fed3cbf849 Mon Sep 17 00:00:00 2001 From: An Wang Date: Mon, 28 Mar 2022 15:16:56 -0700 Subject: [PATCH 07/10] fix --- tests/python/frontend/onnx/test_forward.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index bce89fb38a21..94fd0a5de40b 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5150,8 +5150,6 @@ def verify_eyelike(indata): @pytest.mark.parametrize("onnx_test", onnx_test_folders) @tvm.testing.parametrize_targets def test_onnx_nodes(target, dev, onnx_test): - target = "cuda" - dev = tvm.cuda(0) target_kind = tvm.target.Target(target).kind.name if onnx_test in unsupported_onnx_tests: From a3ba9ad7754eb9b89f5f21133e581a291a8b49dd Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 5 Apr 2022 17:42:30 -0700 Subject: [PATCH 08/10] alter tests --- tests/python/relay/test_pass_legalize_tensorcore.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 97860630dea5..4933520ac3fe 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -249,6 +249,7 @@ def expected(): a = before() a = run_opt_pass(a, transform.Legalize()) b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) # dense @@ -258,8 +259,8 @@ def expected(): _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype) _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype) _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype) - _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype) - _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), dtype, False) + _test_legalize_dense((3, 16), (32, 16), (1, 0, 0), dtype) + _test_legalize_dense((1, 16), (32, 
16), (0, 0, 0), dtype, False) # Test if units parameter is correctly updated _test_legalize_dense((8, 16), (30, 16), (0, 0, 2), "float16", units=30) From 749cc1642679ad8ca614ca3261a44dc409fe107c Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 12 Apr 2022 16:08:15 -0700 Subject: [PATCH 09/10] extra 4x4x4 step --- python/tvm/topi/cuda/tensorcore_alter_op.py | 26 +++++++++++++++---- .../relay/test_pass_legalize_tensorcore.py | 4 +-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index d1d27587d128..972f77cf6f36 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -149,17 +149,17 @@ def _dense_legalize(attrs, inputs, arg_types): # Pad input and output channels to use tensorcore schedule. if dtype in ["float16", "int8", "uint8"]: # The shape of (M, K, N) must be multiple of - # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) or (4, 4, 4) + # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) + # from https://arxiv.org/pdf/1811.09736.pdf if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) - or (M % 4 == 0 and K % 4 == 0 and N % 4 == 0) ): # no need to pad return None - candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32), (4, 4, 4)] + candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] elif dtype in ["int4", "uint4"]: if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: # no need to pad @@ -172,7 +172,19 @@ def _dense_legalize(attrs, inputs, arg_types): if extra_flops_ratio > 2: logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) - return None + + # If tensorcore schedule padding fails, pad to nearest upward 4x4x4 as long as + # the additional flops ratio isn't double or more. + # Note that 4x4x4 is invalid for tensorcore scheduling, but padding upwards to 4x4x4 + # doesn't hurt if tensorcore padding has already failed. 
+ if M % 4 == 0 and K % 4 == 0 and N % 4 == 0: + # No need to pad + return None + (dm, dk, dn) = _pad_to(M, K, N, (4, 4, 4)) + extra_flops_ratio = _extra_flops(M, K, N, dm, dk, dn) / (M * K * N) + + if extra_flops_ratio > 2: + return None logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) @@ -200,7 +212,7 @@ def pad_to_tensorcore(M, K, N, candidates): best_pad = (0, 0, 0) for padding in candidates: dm, dk, dn = _pad_to(M, K, N, padding) - e = (M + dm) * (N + dn) * (K + dk) - M * N * K + e = _extra_flops(M, K, N, dm, dk, dn) # print(dm, dk, dn, e, flops) if e < extra_flops: extra_flops = e @@ -208,6 +220,10 @@ def pad_to_tensorcore(M, K, N, candidates): return best_pad, extra_flops / flops +def _extra_flops(M, K, N, dm, dk, dn): + return (M + dm) * (N + dn) * (K + dk) - M * N * K + + def _pad_to(M, K, N, PADDING): dm, dk, dn = 0, 0, 0 diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 4933520ac3fe..89ec57c3f8c8 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -259,7 +259,7 @@ def expected(): _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype) _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype) _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype) - _test_legalize_dense((3, 16), (32, 16), (1, 0, 0), dtype) + _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype) _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), dtype, False) # Test if units parameter is correctly updated @@ -272,7 +272,7 @@ def expected(): _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4") _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4") _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4") - _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False) + _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), "int4", False) @tvm.testing.uses_gpu From bbdf02c9e62f123bd37b0c78e0bb9967cce8b5e3 Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 12 Apr 2022 17:06:01 -0700 Subject: [PATCH 10/10] comments --- python/tvm/topi/cuda/tensorcore_alter_op.py | 16 ++++++++-------- .../relay/test_pass_legalize_tensorcore.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index 972f77cf6f36..0ba428014548 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -148,9 +148,7 @@ def _dense_legalize(attrs, inputs, arg_types): # Pad input and output channels to use tensorcore schedule. if dtype in ["float16", "int8", "uint8"]: - # The shape of (M, K, N) must be multiple of - # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) - # from https://arxiv.org/pdf/1811.09736.pdf + # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) @@ -169,10 +167,10 @@ def _dense_legalize(attrs, inputs, arg_types): return None (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates) + skip_pad = extra_flops_ratio > 2 - if extra_flops_ratio > 2: - logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) - + if skip_pad and dtype in ["int8", "uint8"]: + skip_pad = False # If tensorcore schedule padding fails, pad to nearest upward 4x4x4 as long as # the additional flops ratio isn't double or more. 
# Note that 4x4x4 is invalid for tensorcore scheduling, but padding upwards to 4x4x4 @@ -182,9 +180,11 @@ def _dense_legalize(attrs, inputs, arg_types): return None (dm, dk, dn) = _pad_to(M, K, N, (4, 4, 4)) extra_flops_ratio = _extra_flops(M, K, N, dm, dk, dn) / (M * K * N) + skip_pad = extra_flops_ratio > 2 - if extra_flops_ratio > 2: - return None + if skip_pad: + logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + return None logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 89ec57c3f8c8..0e3c171d87da 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -272,7 +272,7 @@ def expected(): _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4") _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4") _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4") - _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), "int4", False) + _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False) @tvm.testing.uses_gpu
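
A minimal end-to-end sketch of the MatMulInteger import path added by this series; the graph name, shapes, zero-point values, opset choice, and the llvm/VM execution setup below are illustrative assumptions, not taken from the patches.

import numpy as np
import onnx
from onnx import TensorProto, helper

import tvm
from tvm import relay

# Build a tiny uint8 x uint8 -> int32 MatMulInteger model with explicit zero points.
node = helper.make_node("MatMulInteger", ["a", "b", "a_zp", "b_zp"], ["y"])
graph = helper.make_graph(
    [node],
    "matmulinteger_sketch",
    inputs=[
        helper.make_tensor_value_info("a", TensorProto.UINT8, [4, 8]),
        helper.make_tensor_value_info("b", TensorProto.UINT8, [8, 16]),
    ],
    outputs=[helper.make_tensor_value_info("y", TensorProto.INT32, [4, 16])],
    initializer=[
        helper.make_tensor("a_zp", TensorProto.UINT8, [], [113]),
        helper.make_tensor("b_zp", TensorProto.UINT8, [], [128]),
    ],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])

# The frontend lowers this node by reusing the QLinearMatMul converter with unit
# scales and an int32 output zero point, returning the integer matmul result
# directly (no requantize step).
mod, params = relay.frontend.from_onnx(
    model, shape={"a": (4, 8), "b": (8, 16)}, freeze_params=True
)

a = np.random.randint(0, 256, size=(4, 8), dtype="uint8")
b = np.random.randint(0, 256, size=(8, 16), dtype="uint8")
y = relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm").evaluate()(a, b)
print(y.numpy().dtype)  # int32

With a_scale = b_scale = 1.0 and an int32 output, the QLinearMatMul lowering reduces to the integer matmul itself, which is why the converter can skip the requantization step when "int32" appears in expected_out_dtypes.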