From 23e5505bdb2f1dd229069aff2c35c152e571aee1 Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Mon, 21 Dec 2020 21:01:28 +0000 Subject: [PATCH 01/13] added asymmetric padding to conv2d workload --- python/tvm/topi/generic/conv2d.py | 10 +++++----- python/tvm/topi/nn/conv2d.py | 10 ++++++---- python/tvm/topi/x86/conv2d_avx_1x1.py | 6 +++--- python/tvm/topi/x86/conv2d_avx_common.py | 8 ++++---- python/tvm/topi/x86/depthwise_conv2d.py | 4 ++-- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 7dd9aed7545d..0f5021bf9b2c 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -38,9 +38,9 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, @@ -85,10 +85,10 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 8d591a20839a..107e0f4dacca 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -40,8 +40,10 @@ "out_filter", "hkernel", "wkernel", - "hpad", - "wpad", + "padt", + "padb", + "padl", + "padr", "hstride", "wstride", ], @@ -170,7 +172,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) - HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -182,7 +184,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): '{} vs. 
{}".format( data.dtype, kernel.dtype ) - return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, pt, pl, pb, pr, HSTR, WSTR) def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 3e5a12bc43b2..70c12f1aad2f 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -31,10 +31,10 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 8d707445be05..75c466242498 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -27,9 +27,9 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -56,9 +56,9 @@ def _fallback_schedule(cfg, wkl): def _fallback_schedule_int8(cfg, wkl): - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index badba1a248e9..aec0a894a968 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -42,9 +42,9 @@ def _fallback_schedule(cfg, wkl): """ simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): From f5744524331f35d29469932b07daa29af25c318c Mon Sep 17 00:00:00 2001 From: Perry Gibson Date: Tue, 22 Dec 2020 14:50:27 +0000 Subject: [PATCH 02/13] fixed depthwise conv2d padding --- python/tvm/topi/nn/depthwise_conv2d.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 72356821770d..2f56ae4d0c66 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -38,8 +38,10 @@ "out_filter", "hkernel", "wkernel", - "hpad", - "wpad", + "padt", + "padb", + "padl", + "padr", "hstride", "wstride", ], @@ -51,7 +53,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype): _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * 
channel_multiplier - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + pt, pl, pb, pr = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -71,8 +73,10 @@ def _get_workload(data, kernel, stride, padding, out_dtype): out_channel, kh, kw, - HPAD, - WPAD, + pt, + pl, + pb, + pr HSTR, WSTR, ) From 81ec6c690b759c48599334009aba189b305a25da Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 20:34:17 +0000 Subject: [PATCH 03/13] Added fix to include dilation in workload output width calculation --- python/tvm/topi/cuda/conv2d_int8.py | 7 ++--- python/tvm/topi/generic/conv2d.py | 3 ++- python/tvm/topi/nn/conv2d.py | 27 ++++++++++++++++--- python/tvm/topi/nn/depthwise_conv2d.py | 12 ++++++--- python/tvm/topi/x86/conv2d.py | 6 +++-- python/tvm/topi/x86/conv2d_avx_1x1.py | 8 ++++-- python/tvm/topi/x86/conv2d_avx_common.py | 5 +++- python/tvm/topi/x86/conv2d_int8.py | 12 ++++++--- python/tvm/topi/x86/depthwise_conv2d.py | 5 +++- .../topi/python/test_topi_conv2d_int8.py | 17 ++++++++++++ 10 files changed, 82 insertions(+), 20 deletions(-) diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index 50a0e8b71661..001411d6e4c9 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -142,9 +142,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") # compute the output shape - out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 - out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 - + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1 oshape = (batch, oc_chunk, out_height, out_width, oc_block) icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 0f5021bf9b2c..94e863276fb8 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -40,7 +40,8 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): """ pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 4fc4b54530b3..46bb05389fe0 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -41,9 +41,11 @@ "hkernel", "wkernel", "padt", - "padb", "padl", + "padb", "padr", + "hdilation", + "wdilation", "hstride", "wstride", ], @@ -156,7 +158,7 @@ def conv2d_infer_layout(workload, cfg): raise ValueError("missing register for topi.nn.conv2d_infer_layout") -def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layout="NCHW"): """ Get the workload structure. 
""" if data_layout == "NCHW": _, CI, IH, IW = get_const_tuple(data.shape) @@ -173,6 +175,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): KH, KW, CIG, CO = get_const_tuple(kernel.shape) pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -184,7 +187,25 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): '{} vs. {}".format( data.dtype, kernel.dtype ) - return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, pt, pl, pb, pr, HSTR, WSTR) + return Workload( + data.dtype, + out_dtype, + IH, + IW, + CI, + GRPS, + CO, + KH, + KW, + pt, + pl, + pb, + pr, + hdilation, + wdilation, + HSTR, + WSTR, + ) def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 2f56ae4d0c66..b13768d261ca 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -39,21 +39,25 @@ "hkernel", "wkernel", "padt", - "padb", "padl", + "padb", "padr", + "hdilation", + "wdilation", "hstride", "wstride", ], ) -def _get_workload(data, kernel, stride, padding, out_dtype): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype): """ Get the workload structure. """ _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier pt, pl, pb, pr = get_pad_tuple(padding, kernel) + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -76,7 +80,9 @@ def _get_workload(data, kernel, stride, padding, out_dtype): pt, pl, pb, - pr + pr, + hdilation, + wdilation, HSTR, WSTR, ) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a3b7e473415e..b5d037466902 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -69,8 +69,10 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 - out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 + dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 + out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) in_layout = "NCHW%dc" % tile_ic diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 70c12f1aad2f..0a5b654bb4d2 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -33,8 +33,12 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, 
dilation) + dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + + out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 75c466242498..0f8bcfc5ac9c 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -29,7 +29,10 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 905ada68f277..68c1be7b21cf 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -33,7 +33,7 @@ def _get_default_config_int8( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -45,7 +45,7 @@ def _get_default_config_int8( _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8( @@ -138,8 +138,11 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (ih - kernel_height + pt + pb) // sh + 1 - ow = (iw - kernel_width + pl + pr) // sw + 1 + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + oh = (ih - dilated_kernel_h + pt + pb) // sh + 1 + ow = (iw - dilated_kernel_w + pl + pr) // sw + 1 cfg.define_split("tile_ic", in_channel, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0) cfg.define_split("tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0) @@ -159,6 +162,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index aec0a894a968..fa0edaa934d4 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -44,7 +44,10 @@ def _fallback_schedule(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in 
range(simd_width, 0, -1): diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 1bf83eba53ac..ce96f83244d4 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -27,6 +27,8 @@ from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8 from common import Int8Fallback import tvm.testing @@ -385,6 +387,18 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_fallback_schedule_cpu_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(data, kernel, (stride, stride), padding, dilation, dtype) + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output weight. + cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -436,6 +450,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + verify_fallback_schedule_cpu_padding() + for device in ["cuda"]: check_device(device) @@ -547,6 +563,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw_int8(1, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(7, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) + verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) def test_conv2d_nhwc(): From d72c06d2de14e732a7878bd5600aa1095bbb4855 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 20:53:20 +0000 Subject: [PATCH 04/13] Added missing dilation to arm_cpu/conv2d_int8.py workload --- python/tvm/topi/arm_cpu/conv2d_int8.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 445b9ec0c113..1977c508d72e 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -32,11 +32,11 @@ from .arm_utils import get_tiling_B_interleaved_t -def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): +def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Get default int8 schedule config for the workload """ - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype) + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4) @@ -65,6 +65,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype), strides, padding, + dilation, out_dtype, ) return nn.conv2d_NCHWc_int8_compute( From eea4f58060692a71b5213074e796fe88caf26033 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 21:35:54 +0000 Subject: [PATCH 05/13] Fixed dilation for x86 conv2d --- python/tvm/topi/x86/conv2d.py | 7 ++++--- python/tvm/topi/x86/conv2d_avx_common.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) 
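Notes (below the diffstat, so not part of the commit): the recurring arithmetic in
this series is the conv2d output extent with asymmetric padding and dilation. A
minimal sketch in plain Python, not TVM code, with illustrative names only:

    # Output extent along one spatial axis. Asymmetric padding contributes
    # pad_before + pad_after (rather than 2 * pad), and a dilated kernel
    # spans (kernel - 1) * dilation + 1 input elements.
    def out_dim(in_size, kernel, stride, pad_before, pad_after, dilation=1):
        dilated_kernel = (kernel - 1) * dilation + 1
        return (in_size + pad_before + pad_after - dilated_kernel) // stride + 1

    # Checked against the new test case
    # verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)):
    # 35x35 input, 7x7 kernel, stride 2, padding (top, left, bottom, right).
    assert out_dim(35, 7, 2, 0, 2) == 16  # (35 + 0 + 2 - 7) // 2 + 1

This is the rule that the wkl.padt/padl/padb/padr and dilation fields added in the
earlier patches feed into.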
diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index b5d037466902..a751b1362f00 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -35,7 +35,7 @@ def _get_default_config( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -48,12 +48,12 @@ def _get_default_config( static_data_shape.append(dim) data = te.placeholder(static_data_shape, dtype=data.dtype) if is_depthwise: - wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype) + wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) from .depthwise_conv2d import _fallback_schedule _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 if is_kernel_1x1: conv2d_avx_1x1._fallback_schedule(cfg, wkl) @@ -210,6 +210,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 0f8bcfc5ac9c..625e85d561c9 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -29,8 +29,7 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 From e7296ebde25a4cb478bb335f564e314a6db717d8 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Wed, 23 Dec 2020 22:04:57 +0000 Subject: [PATCH 06/13] Improved dilation workload integration in x86 --- python/tvm/topi/x86/conv2d.py | 4 ++-- python/tvm/topi/x86/conv2d_avx_1x1.py | 5 ++--- python/tvm/topi/x86/depthwise_conv2d.py | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a751b1362f00..9f2b9e1bd197 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -69,8 +69,8 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + dilated_kernel_h = (workload.hkernel - 1) * workload.hdilation + 1 + dilated_kernel_w = (workload.wkernel - 1) * workload.wdilation + 1 out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 0a5b654bb4d2..0b23d3ccc462 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -33,9 +33,8 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, 
wkl.wstride - dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index fa0edaa934d4..188e199f4ba1 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -44,8 +44,7 @@ def _fallback_schedule(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr HSTR, WSTR = wkl.hstride, wkl.wstride - _, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 @@ -168,6 +167,7 @@ def depthwise_conv2d_NCHWc( ), strides, (pad_top, pad_down), + dilation, out_dtype, ) if cfg.is_fallback: From 3e5ca07f8acb8c852eedc810e8435c75c5f6a8a8 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Thu, 24 Dec 2020 11:52:47 +0000 Subject: [PATCH 07/13] Fixed x86 conv2d_alter_op to add dilation --- python/tvm/topi/x86/conv2d_alter_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index db3c232b6a7f..0426f383ae33 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ b/python/tvm/topi/x86/conv2d_alter_op.py @@ -73,7 +73,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) @@ -118,7 +118,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config_int8( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -174,7 +174,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout + cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, True, data_layout ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) From 9f7341a1bc12cc1173a05fee622ea0706e47645a Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Thu, 24 Dec 2020 12:45:19 +0000 Subject: [PATCH 08/13] Local linting not always producing same output as CI, probably my fault --- python/tvm/topi/x86/conv2d_alter_op.py | 30 +++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index 
0426f383ae33..11f3b7dc67a7 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ b/python/tvm/topi/x86/conv2d_alter_op.py @@ -73,7 +73,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) @@ -118,7 +126,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config_int8( - cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -174,7 +190,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, dilation, out_dtype, True, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + True, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) From 4fe405d5a8ca6d57445005c0bfeb0c463b1b3798 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Thu, 24 Dec 2020 15:33:57 +0000 Subject: [PATCH 09/13] Fixed bug, tested locally --- python/tvm/topi/x86/conv2d.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index 9f2b9e1bd197..82f94d6cf8cc 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -69,8 +69,9 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - dilated_kernel_h = (workload.hkernel - 1) * workload.hdilation + 1 - dilated_kernel_w = (workload.wkernel - 1) * workload.wdilation + 1 + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (k_height - 1) * hdilation + 1 + dilated_kernel_w = (k_width - 1) * wdilation + 1 out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] From 297bb358ee30118eb1dddbbd0ec7945656659184 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Fri, 25 Dec 2020 13:05:50 +0000 Subject: [PATCH 10/13] Abusing CI until I can figure out how to reproduce the same behaviour of running integration tests locally. 
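For reference, the closest local approximation found so far (the exact CI image,
Python version, and pytest flags may differ) is to run the touched test file
directly from the repo root:

    # Hypothetical local repro, not the CI entry point: run the affected
    # topi int8 tests with pytest and exit with its status code.
    import pytest

    raise SystemExit(
        pytest.main(
            ["tests/python/topi/python/test_topi_conv2d_int8.py", "-k", "test_conv2d_nchw"]
        )
    )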
--- python/tvm/topi/x86/conv2d_int8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 68c1be7b21cf..692d99e9b675 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -139,8 +139,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - dilated_kernel_h = (wkl.hkernel - 1) * dh + 1 - dilated_kernel_w = (wkl.wkernel - 1) * dw + 1 + dilated_kernel_h = (kernel_height - 1) * dh + 1 + dilated_kernel_w = (kernel_width - 1) * dw + 1 oh = (ih - dilated_kernel_h + pt + pb) // sh + 1 ow = (iw - dilated_kernel_w + pl + pr) // sw + 1 From c221db20a0dfe036e2de800e4f75ba278bb482e9 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Sat, 26 Dec 2020 15:57:35 +0000 Subject: [PATCH 11/13] Ammeded conv2d_int8 test --- tests/python/topi/python/test_topi_conv2d_int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index ce96f83244d4..2870ecb8442f 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -389,7 +389,7 @@ def get_ref_data(): def verify_fallback_schedule_cpu_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) - wkl = _get_workload(data, kernel, (stride, stride), padding, dilation, dtype) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) int32_lanes, num_int8_elements = num_filter, in_channel # check if tile_ow candidates are the factors of the right output weight. From ea0acb282b373441a37880a1a7675432886399f8 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Tue, 29 Dec 2020 15:25:02 +0000 Subject: [PATCH 12/13] Updated workload, improved unit tests --- python/tvm/topi/generic/conv2d.py | 4 ++-- python/tvm/topi/nn/conv2d.py | 20 ++++++++++--------- python/tvm/topi/nn/depthwise_conv2d.py | 20 ++++++++++--------- python/tvm/topi/x86/conv2d.py | 2 +- python/tvm/topi/x86/conv2d_avx_1x1.py | 6 +++--- python/tvm/topi/x86/conv2d_avx_common.py | 4 ++-- python/tvm/topi/x86/depthwise_conv2d.py | 4 ++-- .../topi/python/test_topi_conv2d_int8.py | 10 +++++++--- .../topi/python/test_topi_conv2d_nchw.py | 17 ++++++++++++++++ 9 files changed, 56 insertions(+), 31 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 94e863276fb8..60f5bcf30e49 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -39,8 +39,8 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): This is related to input channel. 
""" pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 46bb05389fe0..7f6116d696c0 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -38,16 +38,16 @@ "in_filter", "groups", "out_filter", - "hkernel", - "wkernel", + "kernel_h", + "kernel_w", "padt", "padl", "padb", "padr", - "hdilation", - "wdilation", - "hstride", - "wstride", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) @@ -175,7 +175,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou KH, KW, CIG, CO = get_const_tuple(kernel.shape) pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) - hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -201,8 +203,8 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layou pl, pb, pr, - hdilation, - wdilation, + dilation_h, + dilation_w, HSTR, WSTR, ) diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index b13768d261ca..1b9f52015716 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -36,16 +36,16 @@ "width", "in_filter", "out_filter", - "hkernel", - "wkernel", + "kernel_h", + "kernel_w", "padt", "padl", "padb", "padr", - "hdilation", - "wdilation", - "hstride", - "wstride", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) @@ -56,7 +56,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier pt, pl, pb, pr = get_pad_tuple(padding, kernel) - hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -81,8 +83,8 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): pl, pb, pr, - hdilation, - wdilation, + dilation_h, + dilation_w, HSTR, WSTR, ) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index 82f94d6cf8cc..182454acf3a6 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -54,7 +54,7 @@ def _get_default_config( _fallback_schedule(cfg, wkl) else: wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_avx_1x1._fallback_schedule(cfg, wkl) else: diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 0b23d3ccc462..afee03a9f6a0 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -32,9 +32,9 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() 
pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_h = (wkl.hkernel - 1) * wkl.hdilation + 1 - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1 + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 625e85d561c9..fb58bd350177 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -28,8 +28,8 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index 188e199f4ba1..d05013bc2d27 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -43,8 +43,8 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - dilated_kernel_w = (wkl.wkernel - 1) * wkl.wdilation + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 2870ecb8442f..a934e3ef2fd2 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -114,7 +114,7 @@ def compile_conv2d_NHWC_gemm_int8_arm( s, [A, W, bias, C], device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) else: @@ -387,9 +387,13 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def verify_fallback_schedule_cpu_padding(): + def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, + # regardless of the performance. int32_lanes, num_int8_elements = num_filter, in_channel # check if tile_ow candidates are the factors of the right output weight. 
@@ -450,7 +454,7 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - verify_fallback_schedule_cpu_padding() + verify_workload_padding() for device in ["cuda"]: check_device(device) diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index 1b7575211dac..07ad45c971df 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -25,6 +25,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.x86.conv2d_avx_common import _fallback_schedule import tvm.testing @@ -76,6 +78,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # check if tile_ow candidates are the factors of the right output weight. + cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -101,6 +114,9 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) + if "llvm" in device: + verify_workload_padding() + a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) @@ -242,6 +258,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) verify_conv2d_nchw(1, 64, 8, 64, 3, 1, "VALID", add_bias=True, add_relu=True) verify_conv2d_nchw(1, 64, 8, 64, 24, 1, "SAME", add_bias=True, add_relu=True) + verify_conv2d_nchw(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) if __name__ == "__main__": From 8e24562e9add83db840cf8a85dee4e1dcd7040c3 Mon Sep 17 00:00:00 2001 From: "Perry Gibson (gabriel)" Date: Tue, 29 Dec 2020 16:50:23 +0000 Subject: [PATCH 13/13] Added depthwise conv2d workload test --- python/tvm/topi/arm_cpu/conv2d_int8.py | 2 +- python/tvm/topi/generic/conv2d.py | 6 ++--- python/tvm/topi/nn/depthwise_conv2d.py | 5 ++-- .../topi/testing/depthwise_conv2d_python.py | 2 +- python/tvm/topi/x86/conv2d_avx_common.py | 4 ++-- python/tvm/topi/x86/conv2d_int8.py | 2 +- python/tvm/topi/x86/depthwise_conv2d.py | 2 +- .../topi/python/test_topi_depthwise_conv2d.py | 23 +++++++++++++++++-- 8 files changed, 33 insertions(+), 13 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 1977c508d72e..fc7e4036341a 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -37,7 +37,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype Get default int8 schedule config for the workload """ wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4) else: diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 60f5bcf30e49..4daa84c29528 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -87,9 +87,9 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, 
num_int8_elements): This is related to input channel. """ pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + pt + pb - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 1b9f52015716..052ab8b88d1c 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -55,11 +55,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier - pt, pl, pb, pr = get_pad_tuple(padding, kernel) dilation_h, dilation_w = ( dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) ) - if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -70,6 +68,9 @@ def _get_workload(data, kernel, stride, padding, dilation, out_dtype): '{} vs. {}".format( data.dtype, kernel.dtype ) + dilated_kernel_h = (kh - 1) * dilation_h + 1 + dilated_kernel_w = (kw - 1) * dilation_w + 1 + pt, pl, pb, pr = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) return Workload( data.dtype, out_dtype, diff --git a/python/tvm/topi/testing/depthwise_conv2d_python.py b/python/tvm/topi/testing/depthwise_conv2d_python.py index 06f26ab3a2e4..2239c56134f5 100644 --- a/python/tvm/topi/testing/depthwise_conv2d_python.py +++ b/python/tvm/topi/testing/depthwise_conv2d_python.py @@ -65,7 +65,7 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): 0 : (in_height - filter_height + 1) : stride_h, 0 : (in_width - filter_width + 1) : stride_w, ] - if padding == "SAME": + elif padding == "SAME": out_channel = in_channel * channel_multiplier out_height = np.int(np.ceil(float(in_height) / float(stride_h))) out_width = np.int(np.ceil(float(in_width) / float(stride_w))) diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index fb58bd350177..5e63de329bba 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -59,8 +59,8 @@ def _fallback_schedule(cfg, wkl): def _fallback_schedule_int8(cfg, wkl): pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + pl + pr - wkl.wkernel) // WSTR + 1 + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 692d99e9b675..ca0d0b8b223c 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -46,7 +46,7 @@ def _get_default_config_int8( _fallback_schedule(cfg, wkl) else: wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8( cfg, wkl, int32_lanes=16, num_int8_elements=4 diff --git 
a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index d05013bc2d27..a0225ef9e147 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -46,7 +46,7 @@ def _fallback_schedule(cfg, wkl): HSTR, WSTR = wkl.stride_h, wkl.stride_w dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 - out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 + out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 55d2fe0c4e52..804c486d27d7 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -23,6 +23,8 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.utils import get_pad_tuple from tvm.contrib.pickle_memoize import memoize +from tvm.topi.nn.depthwise_conv2d import _get_workload +from tvm.topi.x86.depthwise_conv2d import _fallback_schedule import tvm.testing @@ -116,8 +118,8 @@ def depthwise_conv2d_with_workload_nchw( if dilation == 1: # here we transform the padding argument from 'str' to 'tuple' , # because we need this to match the "workload" tuple to the records in TopHub - pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) - padding_args = (pad_h, pad_w) + padt, padl, padb, padr = get_pad_tuple(padding, (filter_height, filter_width)) + padding_args = (padt, padl, padb, padr) else: padding_args = padding @@ -205,6 +207,23 @@ def get_ref_data(): relu_scipy, ) = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape) + wkl = _get_workload( + Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype + ) + + # check if tile_ow candidates are the factors of the right output weight. + with tvm.target.Target(device): + cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + if "llvm" in device: + verify_workload_padding() + input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx)
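Notes on PATCH 13 (a sketch, not part of the commit): the key change in
nn/depthwise_conv2d.py is that get_pad_tuple now receives the dilated kernel
extent, so string paddings such as "SAME" are computed for the window the kernel
actually covers. A rough illustration of the effect in plain Python, not TVM's
exact get_pad_tuple:

    # Stride-1 "SAME" padding along one axis. The total pad must cover the
    # input span of the dilated window, (kernel - 1) * dilation + 1.
    def same_pad_1d(kernel, dilation):
        dilated_kernel = (kernel - 1) * dilation + 1
        pad = dilated_kernel - 1  # keeps out_size == in_size at stride 1
        pad_before = pad // 2  # one possible split; TVM's rounding of odd totals may differ
        return pad_before, pad - pad_before

    # A 3-tap kernel with dilation 2 spans 5 input pixels, so "SAME" needs
    # (2, 2) per axis, not the (1, 1) the undilated kernel shape implies.
    assert same_pad_1d(3, 2) == (2, 2)
    assert same_pad_1d(3, 1) == (1, 1)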