From 88ce7bddc0b6c5833c4d7be6f200909f992b00da Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Thu, 22 Dec 2022 15:02:56 +0000 Subject: [PATCH 1/3] [TOPI][bugfix] Fix a bug in arm_cpu int8 dotprod schedule and modernize tests topi.arm_cpu.schedule_conv2d_NHWC_quantized_native was failing compilation in case the input channels divided by 4 was less than 4. This was because we were splitting this axis by a factor of 4 to create appropriate loop nest for tensorize, but then tensorize was assuming the outer axis bound was divisible by 4. If the outer bound was less than 4, compilation failed; if it was greater than 4 but not divisible by 4, we were occasionally accessing data outside of the tensor, which luckily was padded due to alignment (I think). So here we make sure that we explicitly pad the input axis such that the outer loop will always be divisible by 4. There are also some refactors to test_topi_conv2d_int8.py: - decouple the tests using pytest.parametrize - extend the NHWC int8 schedules test to test against arm targets and various schedules. When these schedules were initially added, we didn't have Arm CI, so only compilation was tested; now we can also run the workloads on Arm targets. 
Change-Id: Iba7db541d8fff54736dabc310a9657f18623e556 --- python/tvm/topi/nn/conv2d.py | 4 +- .../topi/python/test_topi_conv2d_int8.py | 1112 ++++++++--------- 2 files changed, 537 insertions(+), 579 deletions(-) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index db1bcaa27694..42d5ee3f2710 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -606,8 +606,8 @@ def conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols): if N % tile_rows != 0: pad_N = tile_rows - (N % tile_rows) - if K % tile_cols != 0: - pad_K = tile_cols - (K % tile_cols) + if K % (tile_cols * 4) != 0: + pad_K = (tile_cols * 4) - (K % (tile_cols * 4)) N_padded = N + pad_N K_padded = K + pad_K diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index c84f39ab5a66..636241a5fc02 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -35,261 +35,135 @@ import platform -def compile_conv2d_NHWC_gemm_int8_arm( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") - bias = te.placeholder((num_filter,), name="bias", dtype="int8") - dtype = "int32" - devices = [ - ( - "llvm --device arm_cpu --mtriple aarch64-linux-gnu", - topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - ), - ( - "llvm 
--device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", - topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - ), - ( - "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", - topi.arm_cpu.compute_conv2d_NHWC_quantized_native, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, - ), - # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension - # ( - # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", - # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - # ), - ] - - for device_tuple in devices: - target = device_tuple[0] - compute = device_tuple[1] - schedule = device_tuple[2] - - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Compiling on arm AArch64 target: %s" % target) - with tvm.target.Target(target) as tvm_target: - assert tvm_target.features.is_aarch64, "AArch64 target not recognized" +devices = [ + ( + "llvm", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + # TODO(giuseros) We need LLVM-11 in order to compile with +i8mm extension + # ( + # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", + # 
topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + # ), +] + + +@tvm.testing.requires_llvm +@pytest.mark.parametrize("device", devices) +@pytest.mark.parametrize( + "params", + [ + # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding) + (1, 3, 299, 32, 3, 2, "SAME", 1, False, False), + (1, 32, 149, 32, 3, 1, "SAME", 2, False, False), + (4, 32, 147, 64, 3, 1, "SAME", 1, False, False), + (1, 64, 73, 80, 1, 1, "SAME", 1, False, False), + (1, 80, 73, 192, 3, 1, "SAME", 1, False, False), + (1, 192, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 192, 35, 64, 1, 1, "VALID", 1, False, False), + (1, 192, 35, 32, 1, 1, "SAME", 1, False, False), + (1, 48, 35, 64, 5, 1, "SAME", 1, False, False), + (1, 96, 35, 96, 3, 1, "SAME", 1, False, False), + (1, 256, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 256, 35, 64, 1, 1, "SAME", 1, False, False), + (1, 288, 35, 64, 1, 1, "SAME", 1, False, False), + (1, 288, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 96, 35, 96, 3, 2, "SAME", 1, False, False), + (1, 128, 17, 192, 7, 1, "SAME", 2, False, False), + (1, 160, 17, 160, 7, 1, "SAME", 1, False, False), + (1, 160, 17, 192, 1, 1, "VALID", 1, False, False), + (1, 192, 17, 192, 1, 1, "SAME", 1, False, False), + (1, 768, 5, 128, 1, 1, "SAME", 1, False, False), + (1, 192, 17, 320, 3, 2, "SAME", 1, False, False), + (1, 192, 17, 192, 3, 2, "SAME", 1, False, False), + (1, 1280, 8, 192, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 384, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 320, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 448, 1, 1, "SAME", 1, False, False), + (1, 384, 8, 384, 1, 1, "SAME", 1, False, False), + (1, 384, 8, 384, 3, 1, "SAME", 1, False, False), + (1, 448, 8, 384, 3, 1, "VALID", 1, False, False), + (1, 2048, 8, 320, 1, 1, "SAME", 1, False, False), + (1, 2048, 8, 448, 1, 1, "SAME", 1, True, True), + (1, 2048, 8, 192, 1, 1, "SAME", 1, True, False), + # A trouble case for native 
schedule + (1, 8, 1, 24, 1, 1, "SAME", 1, False, False), + ], +) +def test_conv2d_NHWC_gemm_int8(params, device): - C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = schedule([C]) + with Int8Fallback(): + target, compute, schedule = device - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + + # TODO(ekalda): These combinations hang during compilation + failing_cases = [ + (devices[1], (1, 128, 17, 192, 7, 1, "SAME", 2, False, False)), + (devices[1], (1, 160, 17, 160, 7, 1, "SAME", 1, False, False)), + (devices[1], (1, 448, 8, 384, 3, 1, "VALID", 1, False, False)), # this one passes but is just incredibly slow + ] + if (device, params) in failing_cases: + return + print("Compiling for target: %s" % target) -def verify_conv2d_NHWC_gemm_int8( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_height, 
in_width, in_channel), name="A", dtype="int8") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") - bias = te.placeholder((num_filter,), name="bias", dtype="int8") - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype - - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) - b_np = np.random.uniform(size=bias_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) - c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) - - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) + ) - return a_np, w_np, b_np, c_np + in_height = in_width = in_size - a_np, w_np, b_np, c_np = get_ref_data() + A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") + W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") + bias = te.placeholder((num_filter,), name="bias", dtype="int8") - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = 
topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) - - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, b, c) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - - check_target("llvm") - - -def verify_conv2d_NCHWc_int8( - in_dtype, - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - dtype = A.dtype - out_dtype = "int32" if in_dtype == "int8" else "uint32" - lo = -128 if in_dtype == "int8" else 0 - hi = 127 if in_dtype == "int8" else 255 - - def check_target(target, compute, schedule, oc_block_factor, build_only): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - 
print("Skip because %s is not enabled" % target) - return - if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return - - bias = te.placeholder( - (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype - ) + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): - a_np = np.random.randint(low=lo, high=hi, size=a_shape).astype(out_dtype) - w_np = np.random.randint(low=lo, high=hi, size=w_shape).astype(out_dtype) - b_np = np.random.uniform(size=bias_shape).astype(out_dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( - out_dtype - ) - - # convert to NCHWc - _, _, out_height, out_width = c_np.shape - c_np = c_np.reshape( - (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width) - ).transpose(0, 1, 3, 4, 2) + a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) + w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) @@ -298,378 +172,462 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - with tvm.target.Target(target): - C = compute( - A, - W, - (stride, stride), - padding, - (dilation, dilation), - "NCHW", - "NCHW", - out_dtype, - ) + dev = tvm.device(target, 0) + with tvm.target.Target(target) as tvm_target: + 
C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = schedule([C]) - a = tvm.nd.array(a_np.astype(dtype), dev) - w = tvm.nd.array(w_np.astype(dtype), dev) - b = tvm.nd.array(b_np.astype(out_dtype), dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - - if add_bias: - compile_args = [A, W, bias, C] - run_args = [a, w, b, c] - else: - compile_args = [A, W, C] - run_args = [a, w, c] - - func = tvm.build( - s, - compile_args, - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if build_only: - return + build_inputs = [A, W, bias, C] if add_bias else [A, W, C] + inference_inputs = (a, w, b, c) if add_bias else (a, w, c) + + func = tvm.build( + s, + build_inputs, + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), + ) - print("Running on target: %s" % target) + build_only = tvm_target.features.is_aarch64 and (platform.machine() != "aarch64") - func(*run_args) + if not build_only: + print("Running on target: %s" % target) + func(*inference_inputs) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - targets = [ - ( - "cuda", - lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - topi.cuda.schedule_conv2d_NCHWc_int8, - 4, - False, - ), - # Disable on CI since it does not support spirv int8 dot product - # ( - # "vulkan -from_device=0", - # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - # topi.cuda.schedule_conv2d_NCHWc_int8, - # 4, - # False, - # ), - ] - - build_only_aarch64 
= platform.machine() != "aarch64" - - targets.append( +@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) +@pytest.mark.parametrize( + "params", + [ + # ResNet18 workloads where channels in / out are multiple of oc_block_factor + (1, 64, 56, 64, 3, 1, 1, 1, False, False), + (1, 64, 56, 64, 1, 1, 0, 1, False, False), + (1, 64, 56, 128, 3, 2, 1, 1, False, False), + (1, 64, 56, 128, 1, 2, 0, 1, False, False), + (1, 128, 28, 128, 3, 1, 1, 1, False, False), + (1, 128, 28, 256, 3, 2, 1, 1, False, False), + (1, 128, 28, 256, 1, 2, 0, 1, False, False), + (1, 256, 14, 256, 3, 1, 1, 1, False, False), + (1, 256, 14, 512, 3, 2, 1, 1, False, False), + (1, 256, 14, 512, 1, 2, 0, 1, False, False), + (1, 512, 7, 512, 3, 1, 1, 1, False, False), + # bias, relu + (1, 64, 56, 64, 3, 1, 1, 1, False, True), + (1, 64, 56, 64, 3, 1, 1, 1, True, False), + (1, 64, 56, 64, 3, 1, 1, 1, True, True), + # dilation = 2 + (1, 64, 56, 64, 3, 1, 1, 2, False, False), + # batch size + (4, 64, 56, 64, 3, 1, 1, 1, False, False), + (9, 64, 56, 64, 3, 1, 1, 1, False, False), + # weird workloads + (4, 4, 4, 8, 4, 4, 4, 1, False, False), + # inception v3 workloads where channels in / out are multiple of oc_block_factor + (1, 32, 149, 32, 3, 1, 0, 1, False, False), + (1, 32, 147, 64, 3, 1, 1, 1, False, False), + (1, 64, 73, 80, 1, 1, 0, 1, False, False), + (1, 80, 73, 192, 3, 1, 0, 1, False, False), + (1, 192, 35, 64, 1, 1, 0, 1, False, False), + (1, 192, 35, 48, 1, 1, 0, 1, False, False), + (1, 48, 35, 64, 5, 1, 2, 1, False, False), + (1, 64, 35, 96, 3, 1, 1, 1, False, False), + (1, 96, 35, 96, 3, 1, 1, 1, False, False), + (1, 192, 35, 32, 1, 1, 0, 1, False, False), + (1, 256, 35, 64, 1, 1, 0, 1, False, False), + (1, 256, 35, 48, 1, 1, 0, 1, False, False), + (1, 288, 35, 64, 1, 1, 0, 1, False, False), + (1, 288, 35, 48, 1, 1, 0, 1, False, False), + (1, 288, 35, 384, 3, 2, 0, 1, False, False), + (1, 96, 35, 96, 3, 2, 0, 1, False, False), + (1, 768, 17, 192, 1, 1, 0, 1, False, False), + (1, 768, 17, 128, 
1, 1, 0, 1, False, False), + (1, 128, 17, 128, 1, 1, 0, 1, False, False), + (1, 128, 17, 192, 7, 1, 3, 1, False, False), + (1, 128, 17, 128, 7, 1, 3, 1, False, False), + (1, 128, 17, 192, 1, 1, 0, 1, False, False), + (1, 768, 17, 160, 1, 1, 0, 1, False, False), + (1, 160, 17, 160, 1, 1, 0, 1, False, False), + (1, 160, 17, 192, 7, 1, 3, 1, False, False), + (1, 160, 17, 160, 7, 1, 3, 1, False, False), + (1, 160, 17, 192, 1, 1, 0, 1, False, False), + (1, 192, 17, 192, 1, 1, 0, 1, False, False), + (1, 192, 17, 192, 7, 1, 3, 1, False, False), + (1, 192, 17, 320, 3, 2, 0, 1, False, False), + (1, 192, 17, 192, 3, 2, 0, 1, False, False), + (1, 1280, 8, 320, 1, 1, 0, 1, False, False), + (1, 1280, 8, 384, 1, 1, 0, 1, False, False), + (1, 384, 8, 384, 1, 1, 0, 1, False, False), + (1, 384, 8, 384, 3, 1, 1, 1, False, False), + (1, 1280, 8, 448, 1, 1, 0, 1, False, False), + (1, 448, 8, 384, 3, 1, 1, 1, False, False), + (1, 1280, 8, 192, 1, 1, 0, 1, False, False), + (1, 2048, 8, 320, 1, 1, 0, 1, False, False), + (1, 2048, 8, 384, 1, 1, 0, 1, False, False), + (1, 2048, 8, 448, 1, 1, 0, 1, False, False), + (1, 2048, 8, 192, 1, 1, 0, 1, False, False), + (1, 1024, 19, 88, 3, 1, 1, 1, False, False), + # batch > 1 + (7, 32, 149, 32, 3, 1, 0, 1, False, False), + (8, 32, 149, 32, 3, 1, 0, 1, False, False), + (32, 32, 149, 32, 3, 1, 0, 1, False, False), + # Asymmetric padding + (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False), + (1, 64, 8, 128, 3, 1, (3, 3, 2, 2), 1, False, False), + (1, 64, 8, 64, 1, 1, (1, 2, 2, 1), 1, False, False), + (1, 64, 17, 192, 1, 1, (1, 2), 1, False, False), + (1, 64, 8, 64, 3, 1, (3, 1), 1, False, False), + (1, 128, 8, 384, 3, 1, (0, 2), 1, False, False), + (1, 64, 8, 64, 1, 1, "VALID", 1, False, False), + (1, 392, 8, 64, 3, 1, "VALID", 1, False, False), + (1, 512, 19, 64, 1, 1, "SAME", 1, False, False), + (1, 64, 16, 32, 2, 1, "SAME", 1, False, False), + (1, 64, 8, 64, 3, 1, (1, 2, 2, 1), 1, False, True), + (1, 64, 8, 64, 5, 2, (1, 3), 1, True, False), + 
(1, 64, 56, 64, 3, 1, "VALID", 1, True, True), + (1, 64, 56, 64, 24, 1, "SAME", 1, True, True), + ], +) +def test_conv2d_NCHWc_int8(in_dtype, params): + with Int8Fallback(): ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) ) - ) - - if in_dtype == "int8": - targets += [ - ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ), - ( - "rocm -mattr=+dotprod", - lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - topi.cuda.schedule_conv2d_NCHWc_int8, - 4, - False, - ), - ] - - for target, compute, schedule, oc_block_factor, build_only in targets: - check_target(target, compute, schedule, oc_block_factor, build_only) - - -def verify_conv2d_nchw_int8( - in_dtype, - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", 
dtype=in_dtype) - bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype) - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype - - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) - b_np = np.random.uniform(size=bias_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) - - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) - - return a_np, w_np, b_np, c_np - - a_np, w_np, b_np, c_np = get_ref_data() - - def verify_workload_padding(): - _, _, out_height, out_width = get_const_tuple(c_np.shape) - wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) - - # for testing functionality, - # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, - # regardless of the performance. - int32_lanes, num_int8_elements = num_filter, in_channel - - # check if tile_ow candidates are the factors of the right output weight. 
- cfg = autotvm.get_config() - fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) - ow_tile = np.prod(cfg["tile_ow"].size) - tvm.testing.assert_allclose(ow_tile, out_width) + in_height = in_width = in_size + + A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) + W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + out_dtype = "int32" if in_dtype == "int8" else "uint32" + lo = -128 if in_dtype == "int8" else 0 + hi = 127 if in_dtype == "int8" else 255 + + def check_target(target, compute, schedule, oc_block_factor, build_only): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) + return + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): + print("Skip because int8 intrinsics are not available") + return + + bias = te.placeholder( + (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype + ) + bias_shape = get_const_tuple(bias.shape) - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return + @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + def get_ref_data(): + a_np = np.random.randint(low=lo, high=hi, size=a_shape).astype(out_dtype) + w_np = np.random.randint(low=lo, high=hi, size=w_shape).astype(out_dtype) + b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( + out_dtype + ) + + # convert to NCHWc + _, _, out_height, 
out_width = c_np.shape + c_np = c_np.reshape( + (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width) + ).transpose(0, 1, 3, 4, 2) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + with tvm.target.Target(target): + C = compute( + A, + W, + (stride, stride), + padding, + (dilation, dilation), + "NCHW", + "NCHW", + out_dtype, + ) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + + a = tvm.nd.array(a_np.astype(dtype), dev) + w = tvm.nd.array(w_np.astype(dtype), dev) + b = tvm.nd.array(b_np.astype(out_dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - print("Running on target: %s" % target) - with tvm.target.Target(target): - C = topi.cuda.conv2d_nchw_int8( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.cuda.schedule_conv2d_nchw_int8([C]) - - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, b, c) - else: + compile_args = [A, W, bias, C] + run_args = [a, w, b, c] + else: + compile_args = [A, W, C] + run_args = [a, w, c] + func = tvm.build( s, - [A, W, C], + compile_args, target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) - 
func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - verify_workload_padding() + if build_only: + return - for target in ["cuda"]: - check_target(target) + print("Running on target: %s" % target) + func(*run_args) -@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) -def test_conv2d_nchw(in_dtype): - with Int8Fallback(): - # ResNet18 workloads where channels in / out are multiple of oc_block_factor - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 128, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 256, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 7, 512, 3, 1, 1) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - # bias, relu - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_bias=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True) + targets = [ + ( + "cuda", + lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + topi.cuda.schedule_conv2d_NCHWc_int8, + 4, + False, + ), + # Disable on CI since it does not support spirv int8 dot product + # ( + # "vulkan -from_device=0", + # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + # topi.cuda.schedule_conv2d_NCHWc_int8, + # 4, + # False, + # ), + ] - # dilation = 2 - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2) + build_only_aarch64 = platform.machine() != 
"aarch64" - # batch size - verify_conv2d_NCHWc_int8(in_dtype, 4, 64, 56, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1) + targets.append( + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, + ) + ) - # weird workloads - verify_conv2d_NCHWc_int8(in_dtype, 4, 4, 4, 8, 4, 4, 4) + if in_dtype == "int8": + targets += [ + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, + ), + ( + "rocm -mattr=+dotprod", + lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8( + a, w, s, p, d, l, o + ), + topi.cuda.schedule_conv2d_NCHWc_int8, + 4, + False, + ), + ] + + for target, compute, schedule, oc_block_factor, build_only in targets: + check_target(target, compute, schedule, oc_block_factor, build_only) + + +# Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So, just +# performing basic testing - one test for all different scenarios - batch, dilation etc.. 
+@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) +@pytest.mark.parametrize( + "params", + [ + (1, 64, 56, 64, 3, 1, 1, 1, False, False), + (1, 64, 56, 64, 3, 1, 1, 1, False, True), + (1, 64, 56, 64, 3, 1, 1, 2, False, False), + (9, 64, 56, 64, 3, 1, 1, 1, False, False), + (4, 4, 4, 4, 4, 4, 4, 1, False, False), + (1, 32, 149, 32, 3, 1, 0, 1, False, False), + (7, 32, 149, 32, 3, 1, 0, 1, False, False), + (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False), + (1, 32, 35, 64, 7, 2, (0, 0, 2, 2), 1, False, False), + ], +) +def test_conv2d_nchw_int8(in_dtype, params): + with Int8Fallback(): + ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) + ) - # inception v3 workloads where channels in / out are multiple of oc_block_factor - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 147, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 73, 80, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 80, 73, 192, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 48, 35, 64, 5, 1, 2) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 35, 96, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 32, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 48, 1, 1, 0) - 
verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 384, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 128, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 160, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 320, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 320, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 448, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 448, 8, 384, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 320, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 448, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1024, 19, 88, 3, 1, 1) + in_height = in_width = in_size - # batch > 1 - verify_conv2d_NCHWc_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 8, 32, 149, 32, 3, 1, 0) - 
verify_conv2d_NCHWc_int8(in_dtype, 32, 32, 149, 32, 3, 1, 0) + A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) + W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) + bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype) - # Asymmetric padding - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 128, 3, 1, (3, 3, 2, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, (1, 2, 2, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 17, 192, 1, 1, (1, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (3, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 8, 384, 3, 1, (0, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, "VALID") - verify_conv2d_NCHWc_int8(in_dtype, 1, 392, 8, 64, 3, 1, "VALID") - verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 19, 64, 1, 1, "SAME") - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 16, 32, 2, 1, "SAME") - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (1, 2, 2, 1), add_relu=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) - verify_conv2d_NCHWc_int8( - in_dtype, 1, 64, 56, 64, 3, 1, "VALID", add_bias=True, add_relu=True - ) - verify_conv2d_NCHWc_int8( - in_dtype, 1, 64, 56, 64, 24, 1, "SAME", add_bias=True, add_relu=True - ) + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + def get_ref_data(): + a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) + w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) - # Conv2d NCHW 
int8 schedule testing. Internally, it uses NCHWc schedule. So, just - # performing basic testing - one test for all different scenarios - batch, dilation etc.. - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2) - verify_conv2d_nchw_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw_int8(in_dtype, 4, 4, 4, 4, 4, 4, 4) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0) - verify_conv2d_nchw_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + return a_np, w_np, b_np, c_np -def test_conv2d_nhwc(): - with Int8Fallback(): - # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding) - verify_conv2d_NHWC_gemm_int8(1, 3, 299, 32, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 32, 149, 32, 3, 1, "SAME", dilation=2) - verify_conv2d_NHWC_gemm_int8(4, 32, 147, 64, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 64, 73, 80, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 80, 73, 192, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 64, 1, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 32, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 48, 35, 64, 5, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 256, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 256, 35, 64, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 288, 35, 64, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 288, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 
128, 17, 192, 7, 1, "SAME", dilation=2) - verify_conv2d_NHWC_gemm_int8(1, 160, 17, 160, 7, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 160, 17, 192, 1, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 768, 5, 128, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 320, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 192, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 384, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 320, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 448, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 448, 8, 384, 3, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 320, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 448, 1, 1, "SAME", add_bias=True, add_relu=True) - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 192, 1, 1, "SAME", add_bias=True) - - # Let's also verify that it compiles fine on AArch64 targets - compile_conv2d_NHWC_gemm_int8_arm(1, 3, 299, 32, 3, 2, "SAME") + a_np, w_np, b_np, c_np = get_ref_data() + + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, + # regardless of the performance. + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output weight. 
+ cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) + return + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): + print("Skip because int8 intrinsics are not available") + return + + print("Running on target: %s" % target) + with tvm.target.Target(target): + C = topi.cuda.conv2d_nchw_int8( + A, W, (stride, stride), padding, (dilation, dilation), dtype + ) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = topi.cuda.schedule_conv2d_nchw_int8([C]) + + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + if add_bias: + func = tvm.build( + s, + [A, W, bias, C], + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), + ) + func(a, w, b, c) + else: + func = tvm.build( + s, + [A, W, C], + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), + ) + func(a, w, c) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + + verify_workload_padding() + + for target in ["cuda"]: + check_target(target) if __name__ == "__main__": From 8e168f635789c18bc63ad621fc3b37d103e8ea7f Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Wed, 28 Dec 2022 10:25:07 +0000 Subject: [PATCH 2/3] Linting... 
--- tests/python/topi/python/test_topi_conv2d_int8.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 636241a5fc02..5d61bc4a8165 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -129,7 +129,10 @@ def test_conv2d_NHWC_gemm_int8(params, device): failing_cases = [ (devices[1], (1, 128, 17, 192, 7, 1, "SAME", 2, False, False)), (devices[1], (1, 160, 17, 160, 7, 1, "SAME", 1, False, False)), - (devices[1], (1, 448, 8, 384, 3, 1, "VALID", 1, False, False)), # this one passes but is just incredibly slow + ( + devices[1], + (1, 448, 8, 384, 3, 1, "VALID", 1, False, False), + ), # this one passes but is just incredibly slow ] if (device, params) in failing_cases: return From 2f2ecafe9174ef63b70111a6fbccdf0b66e2b508 Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Thu, 29 Dec 2022 11:35:26 +0000 Subject: [PATCH 3/3] More testing cleanup --- python/tvm/topi/nn/conv2d.py | 10 +- .../topi/python/test_topi_conv2d_int8.py | 173 ++++++++---------- 2 files changed, 88 insertions(+), 95 deletions(-) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 42d5ee3f2710..92b5a90e5b11 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -606,8 +606,14 @@ def conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols): if N % tile_rows != 0: pad_N = tile_rows - (N % tile_rows) - if K % (tile_cols * 4) != 0: - pad_K = (tile_cols * 4) - (K % (tile_cols * 4)) + # Tensorize will later make use of 4 tiles at once across the columns so make sure we pad such + # that the columns is multiple of 4 + column_multiplier = 4 + tile_cols_multiplied = tile_cols * column_multiplier + K_misalignment = K % tile_cols_multiplied + + if K_misalignment != 0: + pad_K = tile_cols_multiplied - K_misalignment N_padded = N + pad_N K_padded = K + pad_K diff --git 
a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 5d61bc4a8165..e05dba3dfee4 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -28,6 +28,7 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.conv2d import _get_workload from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8 +from tvm.testing.aot import get_dtype_range from common import Int8Fallback import tvm.testing @@ -125,6 +126,8 @@ def test_conv2d_NHWC_gemm_int8(params, device): add_relu, ) = params + dtype = "int8" + # TODO(ekalda): These combinations hang during compilation failing_cases = [ (devices[1], (1, 128, 17, 192, 7, 1, "SAME", 2, False, False)), @@ -135,7 +138,7 @@ def test_conv2d_NHWC_gemm_int8(params, device): ), # this one passes but is just incredibly slow ] if (device, params) in failing_cases: - return + pytest.skip("Skipping because this test will hang") print("Compiling for target: %s" % target) @@ -148,19 +151,15 @@ def test_conv2d_NHWC_gemm_int8(params, device): in_height = in_width = in_size - A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") - bias = te.placeholder((num_filter,), name="bias", dtype="int8") - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype + a_shape = (batch, in_height, in_width, in_channel) + w_shape = (kernel, kernel, in_channel, num_filter) + bias_shape = (num_filter,) - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NHWC_gemm_int8") def get_ref_data(): - a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + input_min, input_max = 
get_dtype_range(dtype) + a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype(dtype) + w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) @@ -173,10 +172,10 @@ def get_ref_data(): return a_np, w_np, b_np, c_np - a_np, w_np, b_np, c_np = get_ref_data() - - dev = tvm.device(target, 0) with tvm.target.Target(target) as tvm_target: + A = te.placeholder(a_shape, name="A", dtype=dtype) + W = te.placeholder(w_shape, name="W", dtype=dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=dtype) C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) @@ -184,17 +183,11 @@ def get_ref_data(): C = topi.nn.relu(C) s = schedule([C]) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - - build_inputs = [A, W, bias, C] if add_bias else [A, W, C] - inference_inputs = (a, w, b, c) if add_bias else (a, w, c) + build_args = [A, W, bias, C] if add_bias else [A, W, C] func = tvm.build( s, - build_inputs, + build_args, target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % ( @@ -211,10 +204,22 @@ def get_ref_data(): build_only = tvm_target.features.is_aarch64 and (platform.machine() != "aarch64") - if not build_only: - print("Running on target: %s" % target) - func(*inference_inputs) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + if build_only: + return + + print("Running on target: %s" % target) + + dev = tvm.device(target, 0) + a_np, w_np, b_np, c_np = get_ref_data() + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + + run_args = 
[a, w, b, c] if add_bias else [a, w, c] + func(*run_args) + + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) @pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) @@ -339,27 +344,28 @@ def test_conv2d_NCHWc_int8(in_dtype, params): w_shape = get_const_tuple(W.shape) dtype = A.dtype out_dtype = "int32" if in_dtype == "int8" else "uint32" - lo = -128 if in_dtype == "int8" else 0 - hi = 127 if in_dtype == "int8" else 255 + input_min, input_max = get_dtype_range(in_dtype) def check_target(target, compute, schedule, oc_block_factor, build_only): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return + pytest.skip(reason="Skip because %s is not enabled" % target) if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return + pytest.skip(reason="Skip because %s is not enabled" % target) bias = te.placeholder( (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype ) bias_shape = get_const_tuple(bias.shape) - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NCHWc_int8") def get_ref_data(): - a_np = np.random.randint(low=lo, high=hi, size=a_shape).astype(out_dtype) - w_np = np.random.randint(low=lo, high=hi, size=w_shape).astype(out_dtype) + a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype( + out_dtype + ) + w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype( + out_dtype + ) b_np = np.random.uniform(size=bias_shape).astype(out_dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( @@ -380,8 +386,6 @@ def get_ref_data(): return a_np, w_np, b_np, c_np - a_np, w_np, b_np, c_np = get_ref_data() - with tvm.target.Target(target): C = compute( A, @@ -399,17 
+403,7 @@ def get_ref_data(): C = topi.nn.relu(C) s = schedule([C]) - a = tvm.nd.array(a_np.astype(dtype), dev) - w = tvm.nd.array(w_np.astype(dtype), dev) - b = tvm.nd.array(b_np.astype(out_dtype), dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - - if add_bias: - compile_args = [A, W, bias, C] - run_args = [a, w, b, c] - else: - compile_args = [A, W, C] - run_args = [a, w, c] + compile_args = [A, W, bias, C] if add_bias else [A, W, C] func = tvm.build( s, @@ -422,6 +416,14 @@ def get_ref_data(): if build_only: return + a_np, w_np, b_np, c_np = get_ref_data() + + a = tvm.nd.array(a_np.astype(dtype), dev) + w = tvm.nd.array(w_np.astype(dtype), dev) + b = tvm.nd.array(b_np.astype(out_dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + run_args = [a, w, b, c] if add_bias else [a, w, c] + print("Running on target: %s" % target) func(*run_args) @@ -531,7 +533,7 @@ def test_conv2d_nchw_int8(in_dtype, params): bias_shape = get_const_tuple(bias.shape) dtype = A.dtype - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_nchw_int8") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) @@ -550,7 +552,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def verify_workload_padding(): - _, _, out_height, out_width = get_const_tuple(c_np.shape) + _, _, _, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) # for testing functionality, @@ -568,11 +570,9 @@ def verify_workload_padding(): def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return + pytest.skip("Skip because %s is not enabled" % target) if target == "cuda" and not 
tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return + pytest.skip("Skip because int8 intrinsics are not available") print("Running on target: %s" % target) with tvm.target.Target(target): @@ -585,52 +585,39 @@ def check_target(target): C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_nchw_int8([C]) + build_args = [A, W, bias, C] if add_bias else [A, W, C] + + func = tvm.build( + s, + build_args, + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), + ) + a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % ( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding_sum, - dilation, - ), - ) - func(a, w, b, c) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % ( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding_sum, - dilation, - ), - ) - func(a, w, c) + + run_args = [a, w, b, c] if add_bias else [a, w, c] + + func(*run_args) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) verify_workload_padding() - for target in ["cuda"]: - check_target(target) + check_target("cuda") if __name__ == "__main__":