From 23e5505bdb2f1dd229069aff2c35c152e571aee1 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 21 Dec 2020 21:01:28 +0000 Subject: [PATCH 01/13] added asymmetric padding to conv2d workload --- python/tvm/topi/generic/conv2d.py | 10 +++++----- python/tvm/topi/nn/conv2d.py | 10 ++++++---- python/tvm/topi/x86/conv2d_avx_1x1.py | 6 +++--- python/tvm/topi/x86/conv2d_avx_common.py | 8 ++++---- python/tvm/topi/x86/depthwise_conv2d.py | 4 ++-- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 7dd9aed7545d..0f5021bf9b2c 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -38,9 +38,9 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, @@ -85,10 +85,10 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 8d591a20839a..107e0f4dacca 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -40,8 +40,10 @@ "out_filter", "hkernel", "wkernel", - "hpad", - "wpad", + "padt", + "padb", + "padl", + "padr", "hstride", "wstride", ], @@ -170,7 +172,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) - HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -182,7 +184,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): '{} vs. 
{}".format( data.dtype, kernel.dtype ) - return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, pt, pl, pb, pr, HSTR, WSTR) def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 3e5a12bc43b2..70c12f1aad2f 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -31,10 +31,10 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 8d707445be05..75c466242498 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -27,9 +27,9 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -56,9 +56,9 @@ def _fallback_schedule(cfg, wkl): def _fallback_schedule_int8(cfg, wkl): - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index badba1a248e9..aec0a894a968 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -42,9 +42,9 @@ def _fallback_schedule(cfg, wkl): """ simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): From f5744524331f35d29469932b07daa29af25c318c Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Tue, 22 Dec 2020 14:50:27 +0000 Subject: [PATCH 02/13] fixed depthwise conv2d padding --- python/tvm/topi/nn/depthwise_conv2d.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 72356821770d..2f56ae4d0c66 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -38,8 +38,10 @@ "out_filter", "hkernel", "wkernel", - "hpad", - "wpad", + "padt", + "padb", + "padl", + "padr", "hstride", "wstride", ], @@ -51,7 +53,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype): _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * 
channel_multiplier - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + pt, pl, pb, pr = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -71,8 +73,10 @@ def _get_workload(data, kernel, stride, padding, out_dtype): out_channel, kh, kw, - HPAD, - WPAD, + pt, + pl, + pb, + pr HSTR, WSTR, ) From 81ec6c690b759c48599334009aba189b305a25da Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 20:34:17 +0000 Subject: [PATCH 03/13] Added fix to include dilation in workload output width calculation --- python/tvm/topi/cuda/conv2d_int8.py | 7 ++--- python/tvm/topi/generic/conv2d.py | 3 ++- python/tvm/topi/nn/conv2d.py | 27 ++++++++++++++++--- python/tvm/topi/nn/depthwise_conv2d.py | 12 ++++++--- python/tvm/topi/x86/conv2d.py | 6 +++-- python/tvm/topi/x86/conv2d_avx_1x1.py | 8 ++++-- python/tvm/topi/x86/conv2d_avx_common.py | 5 +++- python/tvm/topi/x86/conv2d_int8.py | 12 ++++++--- python/tvm/topi/x86/depthwise_conv2d.py | 5 +++- .../topi/python/test_topi_conv2d_int8.py | 17 ++++++++++++ 10 files changed, 82 insertions(+), 20 deletions(-) diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index 50a0e8b71661..001411d6e4c9 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -142,9 +142,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") # compute the output shape - out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 - out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 - + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1 oshape = (batch, oc_chunk, out_height, out_width, oc_block) icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 0f5021bf9b2c..94e863276fb8 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -40,7 +40,8 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): """ pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 4fc4b54530b3..46bb05389fe0 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -41,9 +41,11 @@ "hkernel", "wkernel", "padt", - "padb", "padl", + "padb", "padr", + "hdilation", + "wdilation", "hstride", "wstride", ], @@ -156,7 +158,7 @@ def conv2d_infer_layout(workload, cfg): raise ValueError("missing register for topi.nn.conv2d_infer_layout") -def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layout="NCHW"): """ Get the workload structure. 
""" if data_layout == "NCHW": _, CI, IH, IW = get_const_tuple(data.shape) @@ -173,6 +175,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): KH, KW, CIG, CO = get_const_tuple(kernel.shape) pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -184,7 +187,25 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): '{} vs. {}".format( data.dtype, kernel.dtype ) - return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, pt, pl, pb, pr, HSTR, WSTR) + return Workload( + data.dtype, + out_dtype, + IH, + IW, + CI, + GRPS, + CO, + KH, + KW, + pt, + pl, + pb, + pr, + hdilation, + wdilation, + HSTR, + WSTR, + ) def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 2f56ae4d0c66..b13768d261ca 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -39,21 +39,25 @@ "hkernel", "wkernel", "padt", - "padb", "padl", + "padb", "padr", + "hdilation", + "wdilation", "hstride", "wstride", ], ) -def _get_workload(data, kernel, stride, padding, out_dtype): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype): """ Get the workload structure. """ _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier pt, pl, pb, pr = get_pad_tuple(padding, kernel) + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -76,7 +80,9 @@ def _get_workload(data, kernel, stride, padding, out_dtype): pt, pl, pb, - pr + pr, + hdilation, + wdilation, HSTR, WSTR, ) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a3b7e473415e..b5d037466902 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -69,8 +69,10 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 - out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 + dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 + out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) in_layout = "NCHW%dc" % tile_ic diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 70c12f1aad2f..0a5b654bb4d2 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -33,8 +33,12 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, 
dilation) + dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + + out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 75c466242498..0f8bcfc5ac9c 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -29,7 +29,10 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 905ada68f277..68c1be7b21cf 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -33,7 +33,7 @@ def _get_default_config_int8( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -45,7 +45,7 @@ def _get_default_config_int8( _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8( @@ -138,8 +138,11 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (ih - kernel_height + pt + pb) // sh + 1 - ow = (iw - kernel_width + pl + pr) // sw + 1 + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + oh = (ih - dilated_kernel_h + pt + pb) // sh + 1 + ow = (iw - dilated_kernel_w + pl + pr) // sw + 1 cfg.define_split("tile_ic", in_channel, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0) cfg.define_split("tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0) @@ -159,6 +162,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index aec0a894a968..fa0edaa934d4 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -44,7 +44,10 @@ def _fallback_schedule(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in 
range(simd_width, 0, -1): diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 1bf83eba53ac..ce96f83244d4 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -27,6 +27,8 @@ from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8 from common import Int8Fallback import tvm.testing @@ -385,6 +387,18 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_fallback_schedule_cpu_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(data, kernel, (stride, stride), padding, dilation, dtype) + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output weight. + cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -436,6 +450,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + verify_fallback_schedule_cpu_padding() + for device in ["cuda"]: check_device(device) @@ -547,6 +563,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw_int8(1, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(7, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) + verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) def test_conv2d_nhwc(): From d72c06d2de14e732a7878bd5600aa1095bbb4855 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 20:53:20 +0000 Subject: [PATCH 04/13] Added missing dilation to arm_cpu/conv2d_int8.py workload --- python/tvm/topi/arm_cpu/conv2d_int8.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 445b9ec0c113..1977c508d72e 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -32,11 +32,11 @@ from .arm_utils import get_tiling_B_interleaved_t -def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): +def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Get default int8 schedule config for the workload """ - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype) + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4) @@ -65,6 +65,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype), strides, padding, + dilation, out_dtype, ) return nn.conv2d_NCHWc_int8_compute( From eea4f58060692a71b5213074e796fe88caf26033 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 21:35:54 +0000 Subject: [PATCH 05/13] Fixed dilation for x86 conv2d --- python/tvm/topi/x86/conv2d.py | 7 ++++--- python/tvm/topi/x86/conv2d_avx_common.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) 
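Notes (below the diffstat, so not part of the commit): the recurring arithmetic in
this series is the conv2d output extent with asymmetric padding and dilation. A
minimal sketch in plain Python, not TVM code, with illustrative names only:

    # Output extent along one spatial axis. Asymmetric padding contributes
    # pad_before + pad_after (rather than 2 * pad), and a dilated kernel
    # spans (kernel - 1) * dilation + 1 input elements.
    def out_dim(in_size, kernel, stride, pad_before, pad_after, dilation=1):
        dilated_kernel = (kernel - 1) * dilation + 1
        return (in_size + pad_before + pad_after - dilated_kernel) // stride + 1

    # Checked against the new test case
    # verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)):
    # 35x35 input, 7x7 kernel, stride 2, padding (top, left, bottom, right).
    assert out_dim(35, 7, 2, 0, 2) == 16  # (35 + 0 + 2 - 7) // 2 + 1

This is the rule that the wkl.padt/padl/padb/padr and dilation fields added in the
earlier patches feed into.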
diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index b5d037466902..a751b1362f00 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -35,7 +35,7 @@ def _get_default_config( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -48,12 +48,12 @@ def _get_default_config( static_data_shape.append(dim) data = te.placeholder(static_data_shape, dtype=data.dtype) if is_depthwise: - wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype) + wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) from .depthwise_conv2d import _fallback_schedule _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 if is_kernel_1x1: conv2d_avx_1x1._fallback_schedule(cfg, wkl) @@ -210,6 +210,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 0f8bcfc5ac9c..625e85d561c9 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -29,8 +29,7 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 From e7296ebde25a4cb478bb335f564e314a6db717d8 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 22:04:57 +0000 Subject: [PATCH 06/13] Improved dilation workload integration in x86 --- python/tvm/topi/x86/conv2d.py | 4 ++-- python/tvm/topi/x86/conv2d_avx_1x1.py | 5 ++--- python/tvm/topi/x86/depthwise_conv2d.py | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a751b1362f00..9f2b9e1bd197 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -69,8 +69,8 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + dilated_kernel_h = (workload.hkernel - 1) * workload.hdilation + 1 + dilated_kernel_w = (workload.wkernel - 1) * workload.wdilation + 1 out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 0a5b654bb4d2..0b23d3ccc462 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -33,9 +33,8 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, 
wkl.wstride - dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index fa0edaa934d4..188e199f4ba1 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -44,8 +44,7 @@ def _fallback_schedule(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 @@ -168,6 +167,7 @@ def depthwise_conv2d_NCHWc( ), strides, (pad_top, pad_down), + dilation, out_dtype, ) if cfg.is_fallback: From 3e5ca07f8acb8c852eedc810e8435c75c5f6a8a8 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Thu, 24 Dec 2020 11:52:47 +0000 Subject: [PATCH 07/13] Fixed x86 conv2d_alter_op to add dilation --- python/tvm/topi/x86/conv2d_alter_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index db3c232b6a7f..0426f383ae33 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ b/python/tvm/topi/x86/conv2d_alter_op.py @@ -73,7 +73,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) @@ -118,7 +118,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config_int8( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -174,7 +174,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout + cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, True, data_layout ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) From 9f7341a1bc12cc1173a05fee622ea0706e47645a Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Thu, 24 Dec 2020 12:45:19 +0000 Subject: [PATCH 08/13] Local linting not always producing same output as CI, probably my fault --- python/tvm/topi/x86/conv2d_alter_op.py | 30 +++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index 
0426f383ae33..11f3b7dc67a7 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ b/python/tvm/topi/x86/conv2d_alter_op.py @@ -73,7 +73,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) @@ -118,7 +126,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config_int8( - cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -174,7 +190,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, True, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + True, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) From 4fe405d5a8ca6d57445005c0bfeb0c463b1b3798 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Thu, 24 Dec 2020 15:33:57 +0000 Subject: [PATCH 09/13] Fixed bug, tested locally --- python/tvm/topi/x86/conv2d.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index 9f2b9e1bd197..82f94d6cf8cc 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -69,8 +69,9 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - dilated_kernel_h = (workload.hkernel - 1) * workload.hdilation + 1 - dilated_kernel_w = (workload.wkernel - 1) * workload.wdilation + 1 + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (k_height - 1) * hdilation + 1 + dilated_kernel_w = (k_width - 1) * wdilation + 1 out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] From 297bb358ee30118eb1dddbbd0ec7945656659184 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Fri, 25 Dec 2020 13:05:50 +0000 Subject: [PATCH 10/13] Abusing CI until I can figure out how to reproduce the same behaviour of running integration tests locally. 
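For reference, the closest local approximation found so far (the exact CI image,
Python version, and pytest flags may differ) is to run the touched test file
directly from the repo root:

    # Hypothetical local repro, not the CI entry point: run the affected
    # topi int8 tests with pytest and exit with its status code.
    import pytest

    raise SystemExit(
        pytest.main(
            ["tests/python/topi/python/test_topi_conv2d_int8.py", "-k", "test_conv2d_nchw"]
        )
    )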
--- python/tvm/topi/x86/conv2d_int8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 68c1be7b21cf..692d99e9b675 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -139,8 +139,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_h = (kernel_height - 1) * dh + 1 + dilated_kernel_w = (kernel_width - 1) * dw + 1 oh = (ih - dilated_kernel_h + pt + pb) // sh + 1 ow = (iw - dilated_kernel_w + pl + pr) // sw + 1 From c221db20a0dfe036e2de800e4f75ba278bb482e9 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Sat, 26 Dec 2020 15:57:35 +0000 Subject: [PATCH 11/13] Ammeded conv2d_int8 test --- tests/python/topi/python/test_topi_conv2d_int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index ce96f83244d4..2870ecb8442f 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -389,7 +389,7 @@ def get_ref_data(): def verify_fallback_schedule_cpu_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) - wkl = _get_workload(data, kernel, (stride, stride), padding, dilation, dtype) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) int32_lanes, num_int8_elements = num_filter, in_channel # check if tile_ow candidates are the factors of the right output weight. From ea0acb282b373441a37880a1a7675432886399f8 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Tue, 29 Dec 2020 15:25:02 +0000 Subject: [PATCH 12/13] Updated workload, improved unit tests --- python/tvm/topi/generic/conv2d.py | 4 ++-- python/tvm/topi/nn/conv2d.py | 20 ++++++++++--------- python/tvm/topi/nn/depthwise_conv2d.py | 20 ++++++++++--------- python/tvm/topi/x86/conv2d.py | 2 +- python/tvm/topi/x86/conv2d_avx_1x1.py | 6 +++--- python/tvm/topi/x86/conv2d_avx_common.py | 4 ++-- python/tvm/topi/x86/depthwise_conv2d.py | 4 ++-- .../topi/python/test_topi_conv2d_int8.py | 10 +++++++--- .../topi/python/test_topi_conv2d_nchw.py | 17 ++++++++++++++++ 9 files changed, 56 insertions(+), 31 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 94e863276fb8..60f5bcf30e49 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -39,8 +39,8 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): This is related to input channel. 
""" pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 46bb05389fe0..7f6116d696c0 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -38,16 +38,16 @@ "in_filter", "groups", "out_filter", - "hkernel", - "wkernel", + "kernel_h", + "kernel_w", "padt", "padl", "padb", "padr", - "hdilation", - "wdilation", - "hstride", - "wstride", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) @@ -175,7 +175,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou KH, KW, CIG, CO = get_const_tuple(kernel.shape) pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) - hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -201,8 +203,8 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou pl, pb, pr, - hdilation, - wdilation, + dilation_h, + dilation_w, HSTR, WSTR, ) diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index b13768d261ca..1b9f52015716 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -36,16 +36,16 @@ "width", "in_filter", "out_filter", - "hkernel", - "wkernel", + "kernel_h", + "kernel_w", "padt", "padl", "padb", "padr", - "hdilation", - "wdilation", - "hstride", - "wstride", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) @@ -56,7 +56,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier pt, pl, pb, pr = get_pad_tuple(padding, kernel) - hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -81,8 +83,8 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): pl, pb, pr, - hdilation, - wdilation, + dilation_h, + dilation_w, HSTR, WSTR, ) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index 82f94d6cf8cc..182454acf3a6 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -54,7 +54,7 @@ def _get_default_config( _fallback_schedule(cfg, wkl) else: wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_avx_1x1._fallback_schedule(cfg, wkl) else: diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 0b23d3ccc462..afee03a9f6a0 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -32,9 +32,9 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() 
pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1 + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 625e85d561c9..fb58bd350177 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -28,8 +28,8 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index 188e199f4ba1..d05013bc2d27 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -43,8 +43,8 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 2870ecb8442f..a934e3ef2fd2 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -114,7 +114,7 @@ def compile_conv2d_NHWC_gemm_int8_arm( s, [A, W, bias, C], device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) else: @@ -387,9 +387,13 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def verify_fallback_schedule_cpu_padding(): + def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, + # regardless of the performance. int32_lanes, num_int8_elements = num_filter, in_channel # check if tile_ow candidates are the factors of the right output weight. 
@@ -450,7 +454,7 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - verify_fallback_schedule_cpu_padding() + verify_workload_padding() for device in ["cuda"]: check_device(device) diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index 1b7575211dac..07ad45c971df 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -25,6 +25,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.x86.conv2d_avx_common import _fallback_schedule import tvm.testing @@ -76,6 +78,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # check if tile_ow candidates are the factors of the right output weight. + cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -101,6 +114,9 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) + if "llvm" in device: + verify_workload_padding() + a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) @@ -242,6 +258,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) verify_conv2d_nchw(1, 64, 8, 64, 3, 1, "VALID", add_bias=True, add_relu=True) verify_conv2d_nchw(1, 64, 8, 64, 24, 1, "SAME", add_bias=True, add_relu=True) + verify_conv2d_nchw(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) if __name__ == "__main__": From 8e24562e9add83db840cf8a85dee4e1dcd7040c3 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Tue, 29 Dec 2020 16:50:23 +0000 Subject: [PATCH 13/13] Added depthwise conv2d workload test --- python/tvm/topi/arm_cpu/conv2d_int8.py | 2 +- python/tvm/topi/generic/conv2d.py | 6 ++--- python/tvm/topi/nn/depthwise_conv2d.py | 5 ++-- .../topi/testing/depthwise_conv2d_python.py | 2 +- python/tvm/topi/x86/conv2d_avx_common.py | 4 ++-- python/tvm/topi/x86/conv2d_int8.py | 2 +- python/tvm/topi/x86/depthwise_conv2d.py | 2 +- .../topi/python/test_topi_depthwise_conv2d.py | 23 +++++++++++++++++-- 8 files changed, 33 insertions(+), 13 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 1977c508d72e..fc7e4036341a 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -37,7 +37,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype Get default int8 schedule config for the workload """ wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4) else: diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 60f5bcf30e49..4daa84c29528 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -87,9 +87,9 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, 
num_int8_elements): This is related to input channel. """ pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 1b9f52015716..052ab8b88d1c 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -55,11 +55,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier - pt, pl, pb, pr = get_pad_tuple(padding, kernel) dilation_h, dilation_w = ( dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) ) - if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -70,6 +68,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): '{} vs. {}".format( data.dtype, kernel.dtype ) + dilated_kernel_h = (kh - 1) * dilation_h + 1 + dilated_kernel_w = (kw - 1) * dilation_w + 1 + pt, pl, pb, pr = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) return Workload( data.dtype, out_dtype, diff --git a/python/tvm/topi/testing/depthwise_conv2d_python.py b/python/tvm/topi/testing/depthwise_conv2d_python.py index 06f26ab3a2e4..2239c56134f5 100644 --- a/python/tvm/topi/testing/depthwise_conv2d_python.py +++ b/python/tvm/topi/testing/depthwise_conv2d_python.py @@ -65,7 +65,7 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): 0 : (in_height - filter_height + 1) : stride_h, 0 : (in_width - filter_width + 1) : stride_w, ] - if padding == "SAME": + elif padding == "SAME": out_channel = in_channel * channel_multiplier out_height = np.int(np.ceil(float(in_height) / float(stride_h))) out_width = np.int(np.ceil(float(in_width) / float(stride_w))) diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index fb58bd350177..5e63de329bba 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -59,8 +59,8 @@ def _fallback_schedule(cfg, wkl): def _fallback_schedule_int8(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 692d99e9b675..ca0d0b8b223c 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -46,7 +46,7 @@ def _get_default_config_int8( _fallback_schedule(cfg, wkl) else: wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8( cfg, wkl, int32_lanes=16, num_int8_elements=4 diff --git 
a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index d05013bc2d27..a0225ef9e147 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -46,7 +46,7 @@ def _fallback_schedule(cfg, wkl): HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 - out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 + out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 55d2fe0c4e52..804c486d27d7 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -23,6 +23,8 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.utils import get_pad_tuple from tvm.contrib.pickle_memoize import memoize +from tvm.topi.nn.depthwise_conv2d import _get_workload +from tvm.topi.x86.depthwise_conv2d import _fallback_schedule import tvm.testing @@ -116,8 +118,8 @@ def depthwise_conv2d_with_workload_nchw( if dilation == 1: # here we transform the padding argument from 'str' to 'tuple' , # because we need this to match the "workload" tuple to the records in TopHub - pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) - padding_args = (pad_h, pad_w) + padt, padl, padb, padr = get_pad_tuple(padding, (filter_height, filter_width)) + padding_args = (padt, padl, padb, padr) else: padding_args = padding @@ -205,6 +207,23 @@ def get_ref_data(): relu_scipy, ) = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape) + wkl = _get_workload( + Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype + ) + + # check if tile_ow candidates are the factors of the right output weight. + with tvm.target.Target(device): + cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + if "llvm" in device: + verify_workload_padding() + input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx)
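Notes on PATCH 13 (a sketch, not part of the commit): the key change in
nn/depthwise_conv2d.py is that get_pad_tuple now receives the dilated kernel
extent, so string paddings such as "SAME" are computed for the window the kernel
actually covers. A rough illustration of the effect in plain Python, not TVM's
exact get_pad_tuple:

    # Stride-1 "SAME" padding along one axis. The total pad must cover the
    # input span of the dilated window, (kernel - 1) * dilation + 1.
    def same_pad_1d(kernel, dilation):
        dilated_kernel = (kernel - 1) * dilation + 1
        pad = dilated_kernel - 1  # keeps out_size == in_size at stride 1
        pad_before = pad // 2  # one possible split; TVM's rounding of odd totals may differ
        return pad_before, pad - pad_before

    # A 3-tap kernel with dilation 2 spans 5 input pixels, so "SAME" needs
    # (2, 2) per axis, not the (1, 1) the undilated kernel shape implies.
    assert same_pad_1d(3, 2) == (2, 2)
    assert same_pad_1d(3, 1) == (1, 1)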