diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index db1bcaa27694..92b5a90e5b11 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -606,8 +606,14 @@ def conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols): if N % tile_rows != 0: pad_N = tile_rows - (N % tile_rows) - if K % tile_cols != 0: - pad_K = tile_cols - (K % tile_cols) + # Tensorize will later make use of 4 tiles at once across the columns so make sure we pad such + # that the number of columns is a multiple of 4 + column_multiplier = 4 + tile_cols_multiplied = tile_cols * column_multiplier + K_misalignment = K % tile_cols_multiplied + + if K_misalignment != 0: + pad_K = tile_cols_multiplied - K_misalignment N_padded = N + pad_N K_padded = K + pad_K diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index c84f39ab5a66..e05dba3dfee4 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -28,6 +28,7 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.conv2d import _get_workload from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8 +from tvm.testing.aot import get_dtype_range from common import Int8Fallback import tvm.testing @@ -35,67 +36,146 @@ import platform -def compile_conv2d_NHWC_gemm_int8_arm( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", 
dtype="int8") - bias = te.placeholder((num_filter,), name="bias", dtype="int8") - dtype = "int32" - devices = [ - ( - "llvm --device arm_cpu --mtriple aarch64-linux-gnu", - topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - ), - ( - "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", - topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - ), +devices = [ + ( + "llvm", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + # TODO(giuseros) We need LLVM-11 in order to compile with +i8mm extension + # ( + # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", + # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + # ), +] + + +@tvm.testing.requires_llvm +@pytest.mark.parametrize("device", devices) +@pytest.mark.parametrize( + "params", + [ + # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding) + (1, 3, 299, 32, 3, 2, "SAME", 1, False, False), + (1, 32, 149, 32, 3, 1, "SAME", 2, False, False), + (4, 32, 147, 64, 3, 1, "SAME", 1, False, False), + (1, 64, 73, 80, 1, 1, "SAME", 1, False, False), + (1, 80, 73, 192, 3, 1, "SAME", 1, False, False), + (1, 192, 35, 48, 1, 1, "SAME", 
1, False, False), + (1, 192, 35, 64, 1, 1, "VALID", 1, False, False), + (1, 192, 35, 32, 1, 1, "SAME", 1, False, False), + (1, 48, 35, 64, 5, 1, "SAME", 1, False, False), + (1, 96, 35, 96, 3, 1, "SAME", 1, False, False), + (1, 256, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 256, 35, 64, 1, 1, "SAME", 1, False, False), + (1, 288, 35, 64, 1, 1, "SAME", 1, False, False), + (1, 288, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 96, 35, 96, 3, 2, "SAME", 1, False, False), + (1, 128, 17, 192, 7, 1, "SAME", 2, False, False), + (1, 160, 17, 160, 7, 1, "SAME", 1, False, False), + (1, 160, 17, 192, 1, 1, "VALID", 1, False, False), + (1, 192, 17, 192, 1, 1, "SAME", 1, False, False), + (1, 768, 5, 128, 1, 1, "SAME", 1, False, False), + (1, 192, 17, 320, 3, 2, "SAME", 1, False, False), + (1, 192, 17, 192, 3, 2, "SAME", 1, False, False), + (1, 1280, 8, 192, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 384, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 320, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 448, 1, 1, "SAME", 1, False, False), + (1, 384, 8, 384, 1, 1, "SAME", 1, False, False), + (1, 384, 8, 384, 3, 1, "SAME", 1, False, False), + (1, 448, 8, 384, 3, 1, "VALID", 1, False, False), + (1, 2048, 8, 320, 1, 1, "SAME", 1, False, False), + (1, 2048, 8, 448, 1, 1, "SAME", 1, True, True), + (1, 2048, 8, 192, 1, 1, "SAME", 1, True, False), + # A trouble case for native schedule + (1, 8, 1, 24, 1, 1, "SAME", 1, False, False), + ], +) +def test_conv2d_NHWC_gemm_int8(params, device): + + with Int8Fallback(): + target, compute, schedule = device + ( - "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", - topi.arm_cpu.compute_conv2d_NHWC_quantized_native, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, - ), - # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension - # ( - # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", - # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - # 
topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - # ), - ] - - for device_tuple in devices: - target = device_tuple[0] - compute = device_tuple[1] - schedule = device_tuple[2] - - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Compiling on arm AArch64 target: %s" % target) - with tvm.target.Target(target) as tvm_target: - assert tvm_target.features.is_aarch64, "AArch64 target not recognized" + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + + dtype = "int8" + + # TODO(ekalda): These combinations hang during compilation + failing_cases = [ + (devices[1], (1, 128, 17, 192, 7, 1, "SAME", 2, False, False)), + (devices[1], (1, 160, 17, 160, 7, 1, "SAME", 1, False, False)), + ( + devices[1], + (1, 448, 8, 384, 3, 1, "VALID", 1, False, False), + ), # this one passes but is just incredibly slow + ] + if (device, params) in failing_cases: + pytest.skip("Skipping because this test will hang") + + print("Compiling for target: %s" % target) + + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) + ) + + in_height = in_width = in_size + + a_shape = (batch, in_height, in_width, in_channel) + w_shape = (kernel, kernel, in_channel, num_filter) + bias_shape = (num_filter,) + + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NHWC_gemm_int8") + def get_ref_data(): + input_min, input_max = get_dtype_range(dtype) + a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype(dtype) + w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = 
tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + return a_np, w_np, b_np, c_np + + with tvm.target.Target(target) as tvm_target: + A = te.placeholder(a_shape, name="A", dtype=dtype) + W = te.placeholder(w_shape, name="W", dtype=dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=dtype) C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) @@ -103,573 +183,441 @@ def compile_conv2d_NHWC_gemm_int8_arm( C = topi.nn.relu(C) s = schedule([C]) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: + build_args = [A, W, bias, C] if add_bias else [A, W, C] + func = tvm.build( s, - [A, W, C], + build_args, target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), ) + build_only = tvm_target.features.is_aarch64 and (platform.machine() != "aarch64") -def verify_conv2d_NHWC_gemm_int8( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, 
padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") - bias = te.placeholder((num_filter,), name="bias", dtype="int8") - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype - - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) - b_np = np.random.uniform(size=bias_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) - c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) - - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) - - return a_np, w_np, b_np, c_np - - a_np, w_np, b_np, c_np = get_ref_data() - - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) - - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = 
tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, b, c) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - - check_target("llvm") - - -def verify_conv2d_NCHWc_int8( - in_dtype, - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - dtype = A.dtype - out_dtype = "int32" if in_dtype == "int8" else "uint32" - lo = -128 if in_dtype == "int8" else 0 - hi = 127 if in_dtype == "int8" else 255 - - def check_target(target, compute, schedule, oc_block_factor, build_only): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return - - bias = te.placeholder( - (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype + if build_only: + return + + print("Running on target: %s" % target) + + dev = tvm.device(target, 0) + a_np, w_np, b_np, c_np = 
get_ref_data() + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + + run_args = [a, w, b, c] if add_bias else [a, w, c] + func(*run_args) + + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + + +@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) +@pytest.mark.parametrize( + "params", + [ + # ResNet18 workloads where channels in / out are multiple of oc_block_factor + (1, 64, 56, 64, 3, 1, 1, 1, False, False), + (1, 64, 56, 64, 1, 1, 0, 1, False, False), + (1, 64, 56, 128, 3, 2, 1, 1, False, False), + (1, 64, 56, 128, 1, 2, 0, 1, False, False), + (1, 128, 28, 128, 3, 1, 1, 1, False, False), + (1, 128, 28, 256, 3, 2, 1, 1, False, False), + (1, 128, 28, 256, 1, 2, 0, 1, False, False), + (1, 256, 14, 256, 3, 1, 1, 1, False, False), + (1, 256, 14, 512, 3, 2, 1, 1, False, False), + (1, 256, 14, 512, 1, 2, 0, 1, False, False), + (1, 512, 7, 512, 3, 1, 1, 1, False, False), + # bias, relu + (1, 64, 56, 64, 3, 1, 1, 1, False, True), + (1, 64, 56, 64, 3, 1, 1, 1, True, False), + (1, 64, 56, 64, 3, 1, 1, 1, True, True), + # dilation = 2 + (1, 64, 56, 64, 3, 1, 1, 2, False, False), + # batch size + (4, 64, 56, 64, 3, 1, 1, 1, False, False), + (9, 64, 56, 64, 3, 1, 1, 1, False, False), + # weird workloads + (4, 4, 4, 8, 4, 4, 4, 1, False, False), + # inception v3 workloads where channels in / out are multiple of oc_block_factor + (1, 32, 149, 32, 3, 1, 0, 1, False, False), + (1, 32, 147, 64, 3, 1, 1, 1, False, False), + (1, 64, 73, 80, 1, 1, 0, 1, False, False), + (1, 80, 73, 192, 3, 1, 0, 1, False, False), + (1, 192, 35, 64, 1, 1, 0, 1, False, False), + (1, 192, 35, 48, 1, 1, 0, 1, False, False), + (1, 48, 35, 64, 5, 1, 2, 1, False, False), + (1, 64, 35, 96, 3, 1, 1, 1, False, False), + (1, 96, 35, 96, 3, 1, 1, 1, False, False), + (1, 192, 35, 32, 1, 1, 0, 1, False, False), + (1, 256, 35, 64, 1, 1, 0, 1, False, False), + (1, 256, 35, 48, 1, 1, 0, 1, 
False, False), + (1, 288, 35, 64, 1, 1, 0, 1, False, False), + (1, 288, 35, 48, 1, 1, 0, 1, False, False), + (1, 288, 35, 384, 3, 2, 0, 1, False, False), + (1, 96, 35, 96, 3, 2, 0, 1, False, False), + (1, 768, 17, 192, 1, 1, 0, 1, False, False), + (1, 768, 17, 128, 1, 1, 0, 1, False, False), + (1, 128, 17, 128, 1, 1, 0, 1, False, False), + (1, 128, 17, 192, 7, 1, 3, 1, False, False), + (1, 128, 17, 128, 7, 1, 3, 1, False, False), + (1, 128, 17, 192, 1, 1, 0, 1, False, False), + (1, 768, 17, 160, 1, 1, 0, 1, False, False), + (1, 160, 17, 160, 1, 1, 0, 1, False, False), + (1, 160, 17, 192, 7, 1, 3, 1, False, False), + (1, 160, 17, 160, 7, 1, 3, 1, False, False), + (1, 160, 17, 192, 1, 1, 0, 1, False, False), + (1, 192, 17, 192, 1, 1, 0, 1, False, False), + (1, 192, 17, 192, 7, 1, 3, 1, False, False), + (1, 192, 17, 320, 3, 2, 0, 1, False, False), + (1, 192, 17, 192, 3, 2, 0, 1, False, False), + (1, 1280, 8, 320, 1, 1, 0, 1, False, False), + (1, 1280, 8, 384, 1, 1, 0, 1, False, False), + (1, 384, 8, 384, 1, 1, 0, 1, False, False), + (1, 384, 8, 384, 3, 1, 1, 1, False, False), + (1, 1280, 8, 448, 1, 1, 0, 1, False, False), + (1, 448, 8, 384, 3, 1, 1, 1, False, False), + (1, 1280, 8, 192, 1, 1, 0, 1, False, False), + (1, 2048, 8, 320, 1, 1, 0, 1, False, False), + (1, 2048, 8, 384, 1, 1, 0, 1, False, False), + (1, 2048, 8, 448, 1, 1, 0, 1, False, False), + (1, 2048, 8, 192, 1, 1, 0, 1, False, False), + (1, 1024, 19, 88, 3, 1, 1, 1, False, False), + # batch > 1 + (7, 32, 149, 32, 3, 1, 0, 1, False, False), + (8, 32, 149, 32, 3, 1, 0, 1, False, False), + (32, 32, 149, 32, 3, 1, 0, 1, False, False), + # Asymmetric padding + (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False), + (1, 64, 8, 128, 3, 1, (3, 3, 2, 2), 1, False, False), + (1, 64, 8, 64, 1, 1, (1, 2, 2, 1), 1, False, False), + (1, 64, 17, 192, 1, 1, (1, 2), 1, False, False), + (1, 64, 8, 64, 3, 1, (3, 1), 1, False, False), + (1, 128, 8, 384, 3, 1, (0, 2), 1, False, False), + (1, 64, 8, 64, 1, 1, "VALID", 1, 
False, False), + (1, 392, 8, 64, 3, 1, "VALID", 1, False, False), + (1, 512, 19, 64, 1, 1, "SAME", 1, False, False), + (1, 64, 16, 32, 2, 1, "SAME", 1, False, False), + (1, 64, 8, 64, 3, 1, (1, 2, 2, 1), 1, False, True), + (1, 64, 8, 64, 5, 2, (1, 3), 1, True, False), + (1, 64, 56, 64, 3, 1, "VALID", 1, True, True), + (1, 64, 56, 64, 24, 1, "SAME", 1, True, True), + ], +) +def test_conv2d_NCHWc_int8(in_dtype, params): + with Int8Fallback(): + ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) ) - bias_shape = get_const_tuple(bias.shape) - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=lo, high=hi, size=a_shape).astype(out_dtype) - w_np = np.random.randint(low=lo, high=hi, size=w_shape).astype(out_dtype) - b_np = np.random.uniform(size=bias_shape).astype(out_dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( - out_dtype - ) + in_height = in_width = in_size - # convert to NCHWc - _, _, out_height, out_width = c_np.shape - c_np = c_np.reshape( - (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width) - ).transpose(0, 1, 3, 4, 2) + A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) + W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(out_dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) + a_shape = get_const_tuple(A.shape) + w_shape = 
get_const_tuple(W.shape) + dtype = A.dtype + out_dtype = "int32" if in_dtype == "int8" else "uint32" + input_min, input_max = get_dtype_range(in_dtype) - return a_np, w_np, b_np, c_np + def check_target(target, compute, schedule, oc_block_factor, build_only): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + pytest.skip(reason="Skip because %s is not enabled" % target) + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): + pytest.skip(reason="Skip because %s is not enabled" % target) - a_np, w_np, b_np, c_np = get_ref_data() + bias = te.placeholder( + (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype + ) + bias_shape = get_const_tuple(bias.shape) + + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NCHWc_int8") + def get_ref_data(): + a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype( + out_dtype + ) + w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype( + out_dtype + ) + b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( + out_dtype + ) + + # convert to NCHWc + _, _, out_height, out_width = c_np.shape + c_np = c_np.reshape( + (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width) + ).transpose(0, 1, 3, 4, 2) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + with tvm.target.Target(target): + C = compute( + A, + W, + (stride, stride), + padding, + (dilation, dilation), + "NCHW", + "NCHW", + out_dtype, + ) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + + compile_args = [A, W, bias, C] if add_bias else [A, W, C] - with tvm.target.Target(target): - C = compute( - 
A, - W, - (stride, stride), - padding, - (dilation, dilation), - "NCHW", - "NCHW", - out_dtype, + func = tvm.build( + s, + compile_args, + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = schedule([C]) - a = tvm.nd.array(a_np.astype(dtype), dev) - w = tvm.nd.array(w_np.astype(dtype), dev) - b = tvm.nd.array(b_np.astype(out_dtype), dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - - if add_bias: - compile_args = [A, W, bias, C] - run_args = [a, w, b, c] - else: - compile_args = [A, W, C] - run_args = [a, w, c] - - func = tvm.build( - s, - compile_args, - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + if build_only: + return - if build_only: - return + a_np, w_np, b_np, c_np = get_ref_data() - print("Running on target: %s" % target) + a = tvm.nd.array(a_np.astype(dtype), dev) + w = tvm.nd.array(w_np.astype(dtype), dev) + b = tvm.nd.array(b_np.astype(out_dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + run_args = [a, w, b, c] if add_bias else [a, w, c] - func(*run_args) + print("Running on target: %s" % target) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + func(*run_args) - targets = [ - ( - "cuda", - lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - topi.cuda.schedule_conv2d_NCHWc_int8, - 4, - False, - ), - # Disable on CI since it does not support spirv int8 dot product - # ( - # "vulkan -from_device=0", - # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - # topi.cuda.schedule_conv2d_NCHWc_int8, - # 4, - # False, - # ), - ] - - build_only_aarch64 = platform.machine() != "aarch64" - - targets.append( - ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu 
-mattr=+neon,+v8.2a,+dotprod", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ) - ) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - if in_dtype == "int8": - targets += [ - ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ), + targets = [ ( - "rocm -mattr=+dotprod", + "cuda", lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), topi.cuda.schedule_conv2d_NCHWc_int8, 4, False, ), + # Disable on CI since it does not support spirv int8 dot product + # ( + # "vulkan -from_device=0", + # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + # topi.cuda.schedule_conv2d_NCHWc_int8, + # 4, + # False, + # ), ] - for target, compute, schedule, oc_block_factor, build_only in targets: - check_target(target, compute, schedule, oc_block_factor, build_only) - - -def verify_conv2d_nchw_int8( - in_dtype, - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype) - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype - - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = 
np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) - b_np = np.random.uniform(size=bias_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) - - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) - - return a_np, w_np, b_np, c_np - - a_np, w_np, b_np, c_np = get_ref_data() - - def verify_workload_padding(): - _, _, out_height, out_width = get_const_tuple(c_np.shape) - wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) - - # for testing functionality, - # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, - # regardless of the performance. - int32_lanes, num_int8_elements = num_filter, in_channel - - # check if tile_ow candidates are the factors of the right output weight. 
- cfg = autotvm.get_config() - fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) - ow_tile = np.prod(cfg["tile_ow"].size) - - tvm.testing.assert_allclose(ow_tile, out_width) - - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return - - print("Running on target: %s" % target) - with tvm.target.Target(target): - C = topi.cuda.conv2d_nchw_int8( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.cuda.schedule_conv2d_nchw_int8([C]) - - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, b, c) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + build_only_aarch64 = platform.machine() != "aarch64" + + targets.append( + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + ) - verify_workload_padding() + if in_dtype == "int8": + targets += [ + ( + "llvm -device arm_cpu -mtriple 
aarch64-linux-gnu -mattr=+neon", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, + ), + ( + "rocm -mattr=+dotprod", + lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8( + a, w, s, p, d, l, o + ), + topi.cuda.schedule_conv2d_NCHWc_int8, + 4, + False, + ), + ] + + for target, compute, schedule, oc_block_factor, build_only in targets: + check_target(target, compute, schedule, oc_block_factor, build_only) + + +# Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So, just +# performing basic testing - one test for all different scenarios - batch, dilation etc.. +@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) +@pytest.mark.parametrize( + "params", + [ + (1, 64, 56, 64, 3, 1, 1, 1, False, False), + (1, 64, 56, 64, 3, 1, 1, 1, False, True), + (1, 64, 56, 64, 3, 1, 1, 2, False, False), + (9, 64, 56, 64, 3, 1, 1, 1, False, False), + (4, 4, 4, 4, 4, 4, 4, 1, False, False), + (1, 32, 149, 32, 3, 1, 0, 1, False, False), + (7, 32, 149, 32, 3, 1, 0, 1, False, False), + (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False), + (1, 32, 35, 64, 7, 2, (0, 0, 2, 2), 1, False, False), + ], +) +def test_conv2d_nchw_int8(in_dtype, params): + with Int8Fallback(): + ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) + ) - for target in ["cuda"]: - check_target(target) + in_height = in_width = in_size + A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) + W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) + bias = te.placeholder((num_filter, 1, 1), name="bias", 
dtype=in_dtype) -@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) -def test_conv2d_nchw(in_dtype): - with Int8Fallback(): - # ResNet18 workloads where channels in / out are multiple of oc_block_factor - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 128, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 256, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 7, 512, 3, 1, 1) + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype - # bias, relu - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_bias=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True) + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_nchw_int8") + def get_ref_data(): + a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) + w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) - # dilation = 2 - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2) + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) - # batch size - 
verify_conv2d_NCHWc_int8(in_dtype, 4, 64, 56, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1) + return a_np, w_np, b_np, c_np - # weird workloads - verify_conv2d_NCHWc_int8(in_dtype, 4, 4, 4, 8, 4, 4, 4) + a_np, w_np, b_np, c_np = get_ref_data() - # inception v3 workloads where channels in / out are multiple of oc_block_factor - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 147, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 73, 80, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 80, 73, 192, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 48, 35, 64, 5, 1, 2) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 35, 96, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 32, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 384, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 128, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 160, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 7, 1, 3) - 
verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 320, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 320, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 448, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 448, 8, 384, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 320, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 448, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1024, 19, 88, 3, 1, 1) + def verify_workload_padding(): + _, _, _, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, + # regardless of the performance. + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output weight. 
+ cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + pytest.skip("Skip because %s is not enabled" % target) + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): + pytest.skip("Skip because int8 intrinsics are not available") + + print("Running on target: %s" % target) + with tvm.target.Target(target): + C = topi.cuda.conv2d_nchw_int8( + A, W, (stride, stride), padding, (dilation, dilation), dtype + ) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = topi.cuda.schedule_conv2d_nchw_int8([C]) + + build_args = [A, W, bias, C] if add_bias else [A, W, C] - # batch > 1 - verify_conv2d_NCHWc_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 8, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 32, 32, 149, 32, 3, 1, 0) + func = tvm.build( + s, + build_args, + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), + ) - # Asymmetric padding - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 128, 3, 1, (3, 3, 2, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, (1, 2, 2, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 17, 192, 1, 1, (1, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (3, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 8, 384, 3, 1, (0, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, "VALID") - verify_conv2d_NCHWc_int8(in_dtype, 1, 392, 8, 64, 3, 1, "VALID") - verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 19, 64, 1, 1, "SAME") - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 16, 32, 2, 1, "SAME") - 
verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (1, 2, 2, 1), add_relu=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) - verify_conv2d_NCHWc_int8( - in_dtype, 1, 64, 56, 64, 3, 1, "VALID", add_bias=True, add_relu=True - ) - verify_conv2d_NCHWc_int8( - in_dtype, 1, 64, 56, 64, 24, 1, "SAME", add_bias=True, add_relu=True - ) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - # Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So, just - # performing basic testing - one test for all different scenarios - batch, dilation etc.. - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2) - verify_conv2d_nchw_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw_int8(in_dtype, 4, 4, 4, 4, 4, 4, 4) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0) - verify_conv2d_nchw_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) + run_args = [a, w, b, c] if add_bias else [a, w, c] + func(*run_args) -def test_conv2d_nhwc(): - with Int8Fallback(): - # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding) - verify_conv2d_NHWC_gemm_int8(1, 3, 299, 32, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 32, 149, 32, 3, 1, "SAME", dilation=2) - verify_conv2d_NHWC_gemm_int8(4, 32, 147, 64, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 64, 73, 80, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 80, 73, 192, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 64, 1, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 32, 1, 1, "SAME") - 
verify_conv2d_NHWC_gemm_int8(1, 48, 35, 64, 5, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 256, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 256, 35, 64, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 288, 35, 64, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 288, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 128, 17, 192, 7, 1, "SAME", dilation=2) - verify_conv2d_NHWC_gemm_int8(1, 160, 17, 160, 7, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 160, 17, 192, 1, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 768, 5, 128, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 320, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 192, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 384, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 320, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 448, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 448, 8, 384, 3, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 320, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 448, 1, 1, "SAME", add_bias=True, add_relu=True) - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 192, 1, 1, "SAME", add_bias=True) - - # Let's also verify that it compiles fine on AArch64 targets - compile_conv2d_NHWC_gemm_int8_arm(1, 3, 299, 32, 3, 2, "SAME") + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + + verify_workload_padding() + + check_target("cuda") if __name__ == "__main__":