From 8670894154de5710e19347ce7c9fa73c070b655c Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 24 Mar 2020 16:33:14 -0700 Subject: [PATCH 01/28] Functional conv3d winograd working. --- topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/conv3d_winograd.py | 329 +++++++++++++++++++++++ 2 files changed, 330 insertions(+) create mode 100644 topi/python/topi/cuda/conv3d_winograd.py diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 302171ee6466..ad385a78fb28 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -31,6 +31,7 @@ from .conv2d_transpose_nchw import * from .deformable_conv2d import * from .conv3d import * +from .conv3d_winograd import * from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py new file mode 100644 index 000000000000..5d7ca322b831 --- /dev/null +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -0,0 +1,329 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Winograd template for cuda backend""" + +import logging +import tvm +from tvm import te +from tvm import autotvm + +from .. import nn +from ..util import get_const_int, get_const_tuple, traverse_inline +from ..nn.winograd_util import winograd_transform_matrices + + +logger = logging.getLogger('conv3d_winograd') + +def _infer_tile_size(data, kernel): + N, CI, D, H, W = get_const_tuple(data.shape) + + if D % 8 == 0: + return 4 + return 2 + +def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed): + """Compute declaration for winograd""" + tile_size = _infer_tile_size(data, kernel) + + N, CI, D, H, W = get_const_tuple(data.shape) + + if isinstance(dilation, int): + dilation_d = dilation_h = dilation_w = dilation + else: + dilation_d, dilation_h, dilation_w = dilation + DSTR, HSTR, WSTR = (strides, strides, strides) if isinstance(strides, int) else strides + + if not pre_computed: # kernel tensor is raw tensor, do strict check + if dilation_d != 1 or dilation_h != 1 or dilation_w != 1: + kernel = nn.dilate(kernel, (1, 1, dilation_d, dilation_h, dilation_w)) + CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + alpha = KW + tile_size - 1 + assert DSTR == 1 and HSTR == 1 and WSTR == 1 and KD == KH and KH == KW + else: + # kernel tensor is pre-transfomred. this op is created by alter op layout. 
+ # dilation is not supported + alpha, _, _, CI, CO = get_const_tuple(kernel.shape) + KD = KH = KW = alpha + 1 - tile_size + assert DSTR == 1 and HSTR == 1 and WSTR == 1 and dilation_d == 1 and dilation_h == 1 and dilation_w == 1 + + pf, pt, pl, pb, pd, pr = nn.get_pad_tuple3d(padding, (KD, KH, KW)) + data_pad = nn.pad(data, (0, 0, pf, pt, pl), (0, 0, pb, pd, pr), name="data_pad") + + r = KW + m = tile_size + A, B, G = winograd_transform_matrices(m, r, out_dtype) + + D = (D + pf + pb - KD) // DSTR + 1 + H = (H + pt + pd - KH) // HSTR + 1 + W = (W + pl + pr - KW) // WSTR + 1 + nD, nH, nW = (D + m-1) // m, (H + m-1) // m, (W + m-1) // m + P = N * nD * nH * nW + + # transform kernel + if not pre_computed: + r_kd = te.reduce_axis((0, KD), name='r_kd') + r_kh = te.reduce_axis((0, KH), name='r_kh') + r_kw = te.reduce_axis((0, KW), name='r_kw') + kernel_pack = te.compute((alpha, alpha, alpha, CI, CO), lambda omg, eps, nu, ci, co: + te.sum(kernel[co][ci][r_kd][r_kh][r_kw] * + G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kd, r_kh, r_kw]), name='kernel_pack') + else: + kernel_pack = kernel + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + # pack input tile + input_tile = te.compute((CI, P, alpha, alpha, alpha), lambda c, p, omg, eps, nu: + data_pad[idxdiv(p, (nD * nH * nW))][c] + [idxmod(idxdiv(p, nH * nW), nD) * m + omg] + [idxmod(idxdiv(p, nW), nH) * m + eps] + [idxmod(p, nW) * m + nu], name='d') + + + # transform data + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_b') + r_c = te.reduce_axis((0, alpha), 'r_c') + data_pack = te.compute((alpha, alpha, alpha, CI, P), lambda omg, eps, nu, ci, p: + te.sum(input_tile[ci][p][r_a][r_b][r_c] * B[r_a][omg] * B[r_b][eps] * B[r_c][nu], + axis=[r_a, r_b, r_c]), name='data_pack') + + + # do batch gemm + ci = te.reduce_axis((0, CI), name='ci') + bgemm = te.compute((alpha, alpha, alpha, CO, P), lambda omg, eps, nu, co, p: + te.sum(kernel_pack[omg][eps][nu][ci][co] * + data_pack[omg][eps][nu][ci][p], + axis=[ci]), name='bgemm') + + # inverse transform + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_b') + r_c = te.reduce_axis((0, alpha), 'r_c') + inverse = te.compute((CO, P, m, m, m), lambda co, p, vd, vh, vw: + te.sum(bgemm[r_a][r_b][r_c][co][p] * A[r_a][vd] * A[r_b][vh] * A[r_c][vw], + axis=[r_a, r_b, r_c]), name='inverse') + + # output + output = te.compute((N, CO, D, H, W), lambda n, co, d, h, w: + inverse[co, + n * nD * nH * nW + idxdiv(d, m) * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), + idxmod(d, m), + idxmod(h, m), + idxmod(w, m)], + name='output', tag='conv3d_ncdhw_winograd') + cfg.add_flop(2 * N * CO * D* H * W * CI * KD * KH * KW) + + return output + + +def schedule_winograd_cuda(cfg, s, output, pre_computed): + """Schedule winograd template""" + # get stages + inverse = s[output].op.input_tensors[0] + bgemm, A = s[inverse].op.input_tensors + kernel_pack, data_pack = s[bgemm].op.input_tensors + input_tile, B = s[data_pack].op.input_tensors + pad_data = s[input_tile].op.input_tensors[0] + + # data transform + s[B].compute_inline() + + data_l = s.cache_write(data_pack, 'local') + omg, eps, nu, c, p = s[data_l].op.axis + r_a, r_b, r_c = s[data_l].op.reduce_axis + for axis in [omg, eps, nu, r_a, r_b, r_c]: + s[data_l].unroll(axis) + + omg, eps, nu, c, p = s[data_pack].op.axis + p, pi = s[data_pack].split(p, 1) + fused = s[data_pack].fuse(c, p) + bb, tt = s[data_pack].split(fused, 128) + s[data_pack].reorder(bb, tt, pi, omg, eps, nu) + s[data_pack].bind(bb, 
te.thread_axis("blockIdx.x")) + s[data_pack].bind(tt, te.thread_axis("threadIdx.x")) + + s[data_l].compute_at(s[data_pack], pi) + s[input_tile].compute_at(s[data_pack], pi) + s[pad_data].compute_inline() + + # transform kernel + if not pre_computed: + kernel, G = s[kernel_pack].op.input_tensors + omg, eps, nu, ci, co = s[kernel_pack].op.axis + if autotvm.GLOBAL_SCOPE.in_tuning: + # skip this part during tuning to make recrods accurate + # this part will be pre-computed during pre-compute optimization pass + s[G].pragma(s[G].op.axis[0], 'debug_skip_region') + s[kernel_pack].pragma(eps, 'debug_skip_region') + else: + s[G].compute_inline() + r_a, r_b, r_c = s[kernel_pack].op.reduce_axis + for axis in [omg, eps, nu, r_a, r_b, r_c]: + s[kernel_pack].unroll(axis) + + fused = s[kernel_pack].fuse(ci, co) + bb, tt = s[kernel_pack].split(fused, 128) + s[kernel_pack].reorder(bb, tt, omg, eps, nu, r_a, r_b, r_c) + s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x")) + else: + kernel = kernel_pack + + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + + ##### space definition begin ##### + b1, b2, b3, y, x = s[bgemm].op.axis + rc = s[bgemm].op.reduce_axis[0] + alpha = get_const_int(b1.dom.extent) + + cfg.define_split("tile_b", cfg.axis(alpha * alpha * alpha), num_outputs=4, + filter=lambda x: x.size[-3:] == [1, 1, 1]) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 128, 1500]) + target = tvm.target.Target.current() + if target.target_name in ['nvptx', 'rocm']: + cfg.define_knob("unroll_explicit", [1]) + else: + cfg.define_knob("unroll_explicit", [0, 1]) + ##### space definition end ##### + + # batch gemm + C = bgemm + A0, B0 = kernel_pack, data_pack + + OL = s.cache_write(C, 'local') + AA = s.cache_read(A0, 'shared', [OL]) + BB = s.cache_read(B0, 'shared', [OL]) + + b = s[bgemm].fuse(b1, b2, b3) + + # tile and bind spatial axes + bgemm_scope, b = s[bgemm].split(b, nparts=1) + bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b) + by, vy, ty, yi = cfg["tile_y"].apply(s, C, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x) + s[C].bind(bz, te.thread_axis("blockIdx.z")) + s[C].bind(by, te.thread_axis("blockIdx.y")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(vz, te.thread_axis("vthread")) + s[C].bind(vy, te.thread_axis("vthread")) + s[C].bind(vx, te.thread_axis("vthread")) + s[C].bind(tz, te.thread_axis("threadIdx.z")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) + s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi) + + # tile reduction axes + s[OL].compute_at(s[C], tx) + b1, b2, b3, y, x = s[OL].op.axis + b = s[OL].fuse(b1, b2, b3) + rc, = s[OL].op.reduce_axis + rco, rci = cfg['tile_rc'].apply(s, OL, rc) + s[OL].reorder(rco, rci, b, y, x) + + s[AA].compute_at(s[OL], rco) + s[BB].compute_at(s[OL], rco) + + # cooperative fetching + for load in [AA, BB]: + fused = s[load].fuse(*list(s[load].op.axis)) + fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) + fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) + fused, tz = s[load].split(fused, cfg["tile_b"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + + s[C].pragma(bgemm_scope, 'auto_unroll_max_step', 
cfg['auto_unroll_max_step'].val) + s[C].pragma(bgemm_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + # schedule inverse, output and fusion + if output.op in s.outputs: + OL = None + else: + OL = output + s[OL].set_scope('local') + output = s.outputs[0] + + m = alpha - 3 + 1 + n, co, d, h, w = s[output].op.axis + do, di = s[output].split(d, m) + ho, hi = s[output].split(w, m) + wo, wi = s[output].split(h, m) + s[output].reorder(n, co, do, ho, wo, di, hi, wi) + inverse_scope, n = s[output].split(n, nparts=1) + + fused = s[output].fuse(n, co, do, ho, wo) + bb, tt = s[output].split(fused, 128) + + s[output].bind(bb, te.thread_axis("blockIdx.x")) + s[output].bind(tt, te.thread_axis("threadIdx.x")) + + if OL is not None: + s[OL].compute_at(s[output], tt) + + s[A].compute_inline() + co, p, vd, vh, vw = s[inverse].op.axis + r_a, r_b, r_c = s[inverse].op.reduce_axis + for axis in [vd, vh, vw, r_a, r_b, r_c]: + s[inverse].unroll(axis) + s[inverse].compute_at(s[output], tt) + + return s + +@autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda") +def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed=False) + +@autotvm.register_topi_schedule("conv3d_ncdhw_winograd.cuda") +def schedule_conv3d_ncdhw_winograd(cfg, outs): + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv3d_ncdhw_winograd' in op.tag: + schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) + + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda") +def conv3d_ncdhw_winograd_without_weight_transform(cfg, data, kernel, strides, + padding, dilation, out_dtype): + return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed=True) + + +@autotvm.register_topi_schedule("conv3d_ncdhw_winograd_without_weight_transform.cuda") +def schedule_conv3d_ncdhw_winograd_without_weight_transform(cfg, outs): + """TOPI schedule callback""" + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv3d_ncdhw_winograd' in op.tag: + schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True) + + traverse_inline(s, outs[0].op, _callback) + return s From fc2486451610d32d1137a0708c53220f28cf558d Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 24 Mar 2020 16:46:52 -0700 Subject: [PATCH 02/28] Formatted python code. 
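A minimal usage sketch of the TOPI entry points added in the previous patch, mirroring how the unit test later in this series drives them; the shapes, target string, and reliance on the AutoTVM fallback config here are illustrative assumptions rather than part of the patch:

    import numpy as np
    import tvm
    from tvm import te, autotvm
    import topi

    # Illustrative NCDHW workload; any 3x3x3, stride-1, dilation-1 case qualifies.
    A = te.placeholder((1, 64, 12, 12, 12), name='A')
    W = te.placeholder((128, 64, 3, 3, 3), name='W')

    with autotvm.tophub.context("cuda"):  # falls back to a default config when untuned
        with tvm.target.create("cuda"):
            C = topi.cuda.conv3d_ncdhw_winograd(A, W, (1, 1, 1), 1, (1, 1, 1), "float32")
            s = topi.cuda.schedule_conv3d_ncdhw_winograd([C])
            func = tvm.build(s, [A, W, C], "cuda")

    ctx = tvm.context("cuda", 0)
    a = tvm.nd.array(np.random.uniform(size=(1, 64, 12, 12, 12)).astype("float32"), ctx)
    w = tvm.nd.array(np.random.uniform(size=(128, 64, 3, 3, 3)).astype("float32"), ctx)
    c = tvm.nd.array(np.zeros((1, 128, 12, 12, 12), dtype="float32"), ctx)
    func(a, w, c)
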
--- topi/python/topi/cuda/conv3d_winograd.py | 94 ++++++++++++++---------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 5d7ca322b831..0097918210a6 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -26,9 +26,9 @@ from ..util import get_const_int, get_const_tuple, traverse_inline from ..nn.winograd_util import winograd_transform_matrices - logger = logging.getLogger('conv3d_winograd') + def _infer_tile_size(data, kernel): N, CI, D, H, W = get_const_tuple(data.shape) @@ -36,8 +36,8 @@ def _infer_tile_size(data, kernel): return 4 return 2 -def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, - pre_computed): + +def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed): """Compute declaration for winograd""" tile_size = _infer_tile_size(data, kernel) @@ -49,7 +49,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, dilation_d, dilation_h, dilation_w = dilation DSTR, HSTR, WSTR = (strides, strides, strides) if isinstance(strides, int) else strides - if not pre_computed: # kernel tensor is raw tensor, do strict check + if not pre_computed: # kernel tensor is raw tensor, do strict check if dilation_d != 1 or dilation_h != 1 or dilation_w != 1: kernel = nn.dilate(kernel, (1, 1, dilation_d, dilation_h, dilation_w)) CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) @@ -72,7 +72,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, D = (D + pf + pb - KD) // DSTR + 1 H = (H + pt + pd - KH) // HSTR + 1 W = (W + pl + pr - KW) // WSTR + 1 - nD, nH, nW = (D + m-1) // m, (H + m-1) // m, (W + m-1) // m + nD, nH, nW = (D + m - 1) // m, (H + m - 1) // m, (W + m - 1) // m P = N * nD * nH * nW # transform kernel @@ -80,56 +80,65 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, r_kd = te.reduce_axis((0, KD), name='r_kd') r_kh = te.reduce_axis((0, KH), name='r_kh') r_kw = te.reduce_axis((0, KW), name='r_kw') - kernel_pack = te.compute((alpha, alpha, alpha, CI, CO), lambda omg, eps, nu, ci, co: - te.sum(kernel[co][ci][r_kd][r_kh][r_kw] * - G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], - axis=[r_kd, r_kh, r_kw]), name='kernel_pack') + kernel_pack = te.compute( + (alpha, alpha, alpha, CI, CO), + lambda omg, eps, nu, ci, co: te.sum( + kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kd, r_kh, r_kw]), + name='kernel_pack') else: kernel_pack = kernel idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod # pack input tile - input_tile = te.compute((CI, P, alpha, alpha, alpha), lambda c, p, omg, eps, nu: - data_pad[idxdiv(p, (nD * nH * nW))][c] + input_tile = te.compute((CI, P, alpha, alpha, alpha), + lambda c, p, omg, eps, nu: data_pad[idxdiv(p, (nD * nH * nW))] + [c] [idxmod(idxdiv(p, nH * nW), nD) * m + omg] [idxmod(idxdiv(p, nW), nH) * m + eps] - [idxmod(p, nW) * m + nu], name='d') - + [idxmod(p, nW) * m + nu], + name='d') # transform data r_a = te.reduce_axis((0, alpha), 'r_a') r_b = te.reduce_axis((0, alpha), 'r_b') r_c = te.reduce_axis((0, alpha), 'r_c') - data_pack = te.compute((alpha, alpha, alpha, CI, P), lambda omg, eps, nu, ci, p: - te.sum(input_tile[ci][p][r_a][r_b][r_c] * B[r_a][omg] * B[r_b][eps] * B[r_c][nu], - axis=[r_a, r_b, r_c]), name='data_pack') - + data_pack = te.compute( + (alpha, alpha, alpha, CI, P), + lambda omg, eps, nu, ci, p: te.sum( + 
input_tile[ci][p][r_a][r_b][r_c] * B[r_a][omg] * B[r_b][eps] * B[r_c][nu], + axis=[r_a, r_b, r_c]), + name='data_pack') # do batch gemm ci = te.reduce_axis((0, CI), name='ci') - bgemm = te.compute((alpha, alpha, alpha, CO, P), lambda omg, eps, nu, co, p: - te.sum(kernel_pack[omg][eps][nu][ci][co] * - data_pack[omg][eps][nu][ci][p], - axis=[ci]), name='bgemm') + bgemm = te.compute( + (alpha, alpha, alpha, CO, P), + lambda omg, eps, nu, co, p: te.sum( + kernel_pack[omg][eps][nu][ci][co] * data_pack[omg][eps][nu][ci][p], axis=[ci]), + name='bgemm') # inverse transform r_a = te.reduce_axis((0, alpha), 'r_a') r_b = te.reduce_axis((0, alpha), 'r_b') r_c = te.reduce_axis((0, alpha), 'r_c') - inverse = te.compute((CO, P, m, m, m), lambda co, p, vd, vh, vw: - te.sum(bgemm[r_a][r_b][r_c][co][p] * A[r_a][vd] * A[r_b][vh] * A[r_c][vw], - axis=[r_a, r_b, r_c]), name='inverse') + inverse = te.compute((CO, P, m, m, m), + lambda co, p, vd, vh, vw: te.sum( + bgemm[r_a][r_b][r_c][co][p] * A[r_a][vd] * A[r_b][vh] * A[r_c][vw], + axis=[r_a, r_b, r_c]), + name='inverse') # output - output = te.compute((N, CO, D, H, W), lambda n, co, d, h, w: - inverse[co, - n * nD * nH * nW + idxdiv(d, m) * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - idxmod(d, m), - idxmod(h, m), - idxmod(w, m)], - name='output', tag='conv3d_ncdhw_winograd') - cfg.add_flop(2 * N * CO * D* H * W * CI * KD * KH * KW) + output = te.compute((N, CO, D, H, W), + lambda n, co, d, h, w: inverse[co, n * nD * nH * nW + idxdiv(d, m) * nH * nW + + idxdiv(h, m) * nW + idxdiv(w, m), + idxmod(d, m), + idxmod(h, m), + idxmod(w, m)], + name='output', + tag='conv3d_ncdhw_winograd') + cfg.add_flop(2 * N * CO * D * H * W * CI * KD * KH * KW) return output @@ -195,8 +204,11 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): rc = s[bgemm].op.reduce_axis[0] alpha = get_const_int(b1.dom.extent) - cfg.define_split("tile_b", cfg.axis(alpha * alpha * alpha), num_outputs=4, - filter=lambda x: x.size[-3:] == [1, 1, 1]) + cfg.define_split( + "tile_b", + cfg.axis(alpha * alpha * alpha), + num_outputs=4, + filter=lambda x: x.size[-3:] == [1, 1, 1]) cfg.define_split("tile_y", y, num_outputs=4) cfg.define_split("tile_x", x, num_outputs=4) cfg.define_split("tile_rc", rc, num_outputs=2) @@ -292,10 +304,12 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): return s + @autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda") def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): - return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, - pre_computed=False) + return winograd_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) + @autotvm.register_topi_schedule("conv3d_ncdhw_winograd.cuda") def schedule_conv3d_ncdhw_winograd(cfg, outs): @@ -310,10 +324,10 @@ def _callback(op): @autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda") -def conv3d_ncdhw_winograd_without_weight_transform(cfg, data, kernel, strides, - padding, dilation, out_dtype): - return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, - pre_computed=True) +def conv3d_ncdhw_winograd_without_weight_transform(cfg, data, kernel, strides, padding, dilation, + out_dtype): + return winograd_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) @autotvm.register_topi_schedule("conv3d_ncdhw_winograd_without_weight_transform.cuda") From c4da60737b1a393b6992f0a5db138dfed1cce981 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 24 Mar 
2020 17:15:04 -0700 Subject: [PATCH 03/28] registered conv3d winograd compute and started adding relay without_weight_transform operator. --- include/tvm/relay/attrs/nn.h | 8 ++++---- python/tvm/relay/op/strategy/cuda.py | 11 +++++++++++ src/relay/op/nn/convolution.cc | 8 ++++---- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 5794ddd0217b..51fc37aa255b 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -156,12 +156,12 @@ struct Conv2DAttrs : public tvm::AttrsNode { }; /*! \brief Attributes used in winograd weight transformation operators */ -struct Conv2DWinogradWeightTransformAttrs : - public tvm::AttrsNode { +struct ConvWinogradWeightTransformAttrs : + public tvm::AttrsNode { int tile_size; - TVM_DECLARE_ATTRS(Conv2DWinogradWeightTransformAttrs, - "relay.attrs.Conv2DWinogradWeightTransformAttrs") { + TVM_DECLARE_ATTRS(ConvWinogradWeightTransformAttrs, + "relay.attrs.ConvWinogradWeightTransformAttrs") { TVM_ATTR_FIELD(tile_size) .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)"); } diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index db03c5965470..c3b2aef6b96b 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -234,12 +234,23 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): """conv3d cuda strategy""" strategy = _op.OpStrategy() layout = attrs.data_layout + stride_d, stride_h, stride_w = attrs.get_int_tuple("strides") + dilation_d, dilation_h, dilation_w = attrs.get_int_tuple("dilation") assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) if layout == "NCDHW": strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), name="conv3d_ncdhw.cuda", plevel=10) + _, _, kd, kh, kw = get_const_tuple(kernel.shape) + if 2< kd < 8 and 2 < kh < 8 and 2 < kw < 8 and kd == kh and kh == kw and \ + stride_d == 1 and stride_h == 1 and stride_w == 1 and \ + dilation_d == 1 and dilation_h == 1 and dilation_w == 1: + strategy.add_implementation( + wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd), + wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd), + name="conv3d_ncdhw_winograd.cuda", + plevel=5) else: # layout == "NDHWC": strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 547d5a6ff692..f074baa77290 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -662,7 +662,7 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") ConvInferCorrectLayout); // relay.nn.contrib_conv2d_winograd_weight_transform -TVM_REGISTER_NODE_TYPE(Conv2DWinogradWeightTransformAttrs); +TVM_REGISTER_NODE_TYPE(ConvWinogradWeightTransformAttrs); bool Conv2DWinogradWeightTransformRel(const Array& types, int num_inputs, @@ -672,7 +672,7 @@ bool Conv2DWinogradWeightTransformRel(const Array& types, const auto* data = types[0].as(); if (data == nullptr) return false; - const Conv2DWinogradWeightTransformAttrs* param = attrs.as(); + const ConvWinogradWeightTransformAttrs* param = attrs.as(); CHECK(param != nullptr); CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; @@ -692,7 +692,7 @@ bool Conv2DWinogradWeightTransformRel(const Array& types, Expr 
MakeConv2DWinogradWeightTransform(Expr weight, int tile_size) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->tile_size = tile_size; static const Op& op = Op::Get("nn.contrib_conv2d_winograd_weight_transform"); return Call(op, {weight}, Attrs(attrs), {}); @@ -711,7 +711,7 @@ weight transformation in advance. - **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) )code" TVM_ADD_FILELINE) -.set_attrs_type() +.set_attrs_type() .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) From 95e4b038a4bff10fa2dc4d13609e07b2640c037e Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 25 Mar 2020 17:37:42 -0700 Subject: [PATCH 04/28] Add topi testing for conv3d winograd. --- .../tests/python/test_topi_conv3d_winograd.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 topi/tests/python/test_topi_conv3d_winograd.py diff --git a/topi/tests/python/test_topi_conv3d_winograd.py b/topi/tests/python/test_topi_conv3d_winograd.py new file mode 100644 index 000000000000..b8c550c58c93 --- /dev/null +++ b/topi/tests/python/test_topi_conv3d_winograd.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
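# The workloads below exercise F(m, r) Winograd with alpha = m + r - 1, where the
# compute in conv3d_winograd.py picks m = 4 when the input depth is a multiple of 8
# and m = 2 otherwise. A minimal sketch of the tiling arithmetic, using a hypothetical
# helper and illustrative numbers that are not part of the patch:


def winograd_tiles(N, D, H, W, K=3, pad=1, stride=1, m=2):
    # Output extents of the equivalent direct convolution.
    Do = (D + 2 * pad - K) // stride + 1
    Ho = (H + 2 * pad - K) // stride + 1
    Wo = (W + 2 * pad - K) // stride + 1
    # Number of m x m x m output tiles; their product is the P axis of the batched GEMM.
    nD, nH, nW = (Do + m - 1) // m, (Ho + m - 1) // m, (Wo + m - 1) // m
    return Do, Ho, Wo, N * nD * nH * nW


# Example: the (1, 64, 12, 128, 3, 1, 1) workload gives Do = Ho = Wo = 12 and P = 216.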
+"""Test code for 3d convolution with winograd.""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.nn.util import get_pad_tuple3d +from topi.util import get_const_tuple + +from common import get_all_backend + +_conv3d_ncdhw_implement = { + "gpu": (topi.cuda.conv3d_ncdhw_winograd, topi.cuda.schedule_conv3d_ncdhw_winograd), +} + +def verify_conv3d_ncdhw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): + pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(padding, (kernel, kernel, kernel)) + padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right + print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, + padding_sum, dilation)) + + in_depth = in_height = in_width = in_size + + A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1, 1), name='bias') + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv3d_ncdhw.verify_conv3d_ncdhw") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation, dilation)) + c_np = topi.testing.conv3d_ncdhw_python(a_np, dw_np, stride, padding) + if add_bias: + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ncdhw_implement) + with tvm.target.create(device): + C = fcompute(A, W, (stride, stride, stride), padding, + (dilation, dilation, dilation), dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = fschedule([C]) + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + if add_bias: + func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + func(a, w, b, c) + else: + func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + func(a, w, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) + + for device in ["cuda"]: + with autotvm.tophub.context(device): # load tophub pre-tuned parameters + check_device(device) + +def test_conv3d_ncdhw(): + #3DCNN workloads + verify_conv3d_ncdhw(1, 61, 20, 120, 3, 1, 0) + verify_conv3d_ncdhw(1, 128, 12, 256, 3, 1, 1) + verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1) + + ### bias, relu + verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, add_relu=True) + verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, add_relu=True, add_bias=True) + + ## dilation = 2 + #verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, dilation=2) + + 
## batch size + verify_conv3d_ncdhw(4, 32, 12, 64, 3, 1, 1) + + # weird workloads + verify_conv3d_ncdhw(2, 2, 2, 2, 3, 1, 2) + verify_conv3d_ncdhw(3, 3, 3, 3, 3, 1, 3) + + +if __name__ == "__main__": + test_conv3d_ncdhw() From 3b5f9cb705860b6bcdd567805335e996bcb96128 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 25 Mar 2020 17:38:28 -0700 Subject: [PATCH 05/28] Format file. --- .../tests/python/test_topi_conv3d_winograd.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/topi/tests/python/test_topi_conv3d_winograd.py b/topi/tests/python/test_topi_conv3d_winograd.py index b8c550c58c93..669fd6305539 100644 --- a/topi/tests/python/test_topi_conv3d_winograd.py +++ b/topi/tests/python/test_topi_conv3d_winograd.py @@ -32,11 +32,22 @@ "gpu": (topi.cuda.conv3d_ncdhw_winograd, topi.cuda.schedule_conv3d_ncdhw_winograd), } -def verify_conv3d_ncdhw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): - pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(padding, (kernel, kernel, kernel)) + +def verify_conv3d_ncdhw(batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation=1, + add_bias=False, + add_relu=False): + pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d( + padding, (kernel, kernel, kernel)) padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right - print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, - padding_sum, dilation)) + print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % + (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_depth = in_height = in_width = in_size @@ -72,8 +83,8 @@ def check_device(device): print("Running on target: %s" % device) fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ncdhw_implement) with tvm.target.create(device): - C = fcompute(A, W, (stride, stride, stride), padding, - (dilation, dilation, dilation), dtype) + C = fcompute(A, W, (stride, stride, stride), padding, (dilation, dilation, dilation), + dtype) if add_bias: C = topi.add(C, bias) if add_relu: @@ -85,10 +96,18 @@ def check_device(device): b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: - func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + func = tvm.build( + s, [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % + (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: - func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + func = tvm.build( + s, [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % + (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) @@ -96,6 +115,7 @@ def check_device(device): with autotvm.tophub.context(device): # load tophub pre-tuned parameters check_device(device) + def test_conv3d_ncdhw(): #3DCNN workloads verify_conv3d_ncdhw(1, 61, 20, 120, 3, 1, 0) From 077625c398942669f3f06638fd3c5b7e19a6585f Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 26 Mar 2020 14:40:55 -0700 Subject: [PATCH 06/28] small tweak to 
unrolling to prevent build sticking. --- python/tvm/relay/op/op_attrs.py | 6 +++--- topi/python/topi/cuda/conv3d_winograd.py | 4 +++- topi/tests/python/test_topi_conv3d_winograd.py | 8 ++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index edc2160e38bc..a753c173ff10 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -34,9 +34,9 @@ class Conv2DWinogradAttrs(Attrs): """Attributes for nn.contrib_conv2d_winograd_without_weight_transform""" -@tvm._ffi.register_object("relay.attrs.Conv2DWinogradWeightTransformAttrs") -class Conv2DWinogradWeightTransformAttrs(Attrs): - """Attributes for nn.contrib_conv2d_winograd_weight_transform""" +@tvm._ffi.register_object("relay.attrs.ConvWinogradWeightTransformAttrs") +class ConvWinogradWeightTransformAttrs(Attrs): + """Attributes for nn.contrib_convNd_winograd_weight_transform""" @tvm._ffi.register_object("relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs") diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 0097918210a6..ba3d3b85de83 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -158,7 +158,9 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): data_l = s.cache_write(data_pack, 'local') omg, eps, nu, c, p = s[data_l].op.axis r_a, r_b, r_c = s[data_l].op.reduce_axis - for axis in [omg, eps, nu, r_a, r_b, r_c]: + # TODO unrolling by omg, eps, nu may improve performance but + # in some cases causes extremely long build times due to imperfect tiling. + for axis in [r_a, r_b, r_c]: s[data_l].unroll(axis) omg, eps, nu, c, p = s[data_pack].op.axis diff --git a/topi/tests/python/test_topi_conv3d_winograd.py b/topi/tests/python/test_topi_conv3d_winograd.py index 669fd6305539..28722fe749fc 100644 --- a/topi/tests/python/test_topi_conv3d_winograd.py +++ b/topi/tests/python/test_topi_conv3d_winograd.py @@ -122,14 +122,14 @@ def test_conv3d_ncdhw(): verify_conv3d_ncdhw(1, 128, 12, 256, 3, 1, 1) verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1) - ### bias, relu + # bias, relu verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, add_relu=True) verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, add_relu=True, add_bias=True) - ## dilation = 2 - #verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, dilation=2) + # dilation = 2 + verify_conv3d_ncdhw(1, 16, 12, 16, 3, 1, "VALID", dilation=2) - ## batch size + # batch size verify_conv3d_ncdhw(4, 32, 12, 64, 3, 1, 1) # weird workloads From 11b60aace33623d833ca49678c1cead9f010c9f0 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 27 Mar 2020 11:29:40 -0700 Subject: [PATCH 07/28] Refactoring convolution ops in relay. --- docs/langref/relay_op.rst | 5 + include/tvm/relay/attrs/nn.h | 63 ++++++++ python/tvm/relay/op/op_attrs.py | 5 + src/relay/op/nn/convolution.cc | 278 +++++++++++++++----------------- src/relay/op/nn/convolution.h | 188 +++++++++++++++++++++ 5 files changed, 390 insertions(+), 149 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index ac636f81ab3d..f1d7d442a14c 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -82,8 +82,13 @@ This level enables typical convnet models. 
tvm.relay.nn.pad tvm.relay.nn.lrn tvm.relay.nn.l2_normalize + tvm.relay.nn.bitpack + tvm.relay.nn.bitserial_dense + tvm.relay.nn.bitserial_conv2d tvm.relay.nn.contrib_conv2d_winograd_without_weight_transform tvm.relay.nn.contrib_conv2d_winograd_weight_transform + tvm.relay.nn.contrib_conv3d_winograd_without_weight_transform + tvm.relay.nn.contrib_conv3d_winograd_weight_transform **Level 3: Additional Math And Transform Operators** diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 51fc37aa255b..6fbaadee772f 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -306,6 +306,69 @@ struct Conv3DAttrs : public tvm::AttrsNode { } }; +/*! \brief Attributes used in 3d winograd convolution operators */ +struct Conv3DWinogradAttrs : public tvm::AttrsNode { + int tile_size; + Array strides; + Array padding; + Array dilation; + int groups; + IndexExpr channels; + Array kernel_size; + std::string data_layout; + std::string kernel_layout; + std::string out_layout; + DataType out_dtype; + + TVM_DECLARE_ATTRS(Conv3DWinogradAttrs, "relay.attrs.Conv3DWinogradAttrs") { + TVM_ATTR_FIELD(tile_size) + .describe("The tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)"); + TVM_ATTR_FIELD(strides).set_default(Array({1, 1, 1})) + .describe("Specifies the strides of the convolution."); + TVM_ATTR_FIELD(padding).set_default(Array({0, 0, 0})) + .describe("If padding is non-zero, then the input is implicitly zero-padded" + "Padding support both symmetric and asymmetric as" + "one int : same padding used on all sides" + "three int : back, bottom, right will use same padding as front, top, left" + "six int : padding width in the order of (front, top, left, back, bottom," + "right)"); + TVM_ATTR_FIELD(dilation).set_default(Array({1, 1, 1})) + .describe("Specifies the dilation rate to use for dilated convolution."); + TVM_ATTR_FIELD(groups).set_default(1) + .describe("Controls the connections between inputs and outputs." + "At groups=1, all inputs are convolved to all outputs." + "At groups=2, the operation becomes equivalent to having two convolution" + "layers side by side, each seeing half the input channels, and producing" + "half the output channels, and both subsequently concatenated."); + TVM_ATTR_FIELD(channels) + .describe("The number of output channels in the convolution." + " If it is not set, inferred by shape of the weight.") + .set_default(NullValue()); + TVM_ATTR_FIELD(kernel_size) + .describe("Specifies the dimensions of the convolution window.") + .set_default(NullValue >()); + TVM_ATTR_FIELD(data_layout).set_default("NCDHW") + .describe("Dimension ordering of input data. Can be 'NCDHW', 'NDHWC', etc." + "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" + "dimensions respectively. Convolution is applied on the 'D', 'H' and" + "'W' dimensions."); + TVM_ATTR_FIELD(kernel_layout).set_default("OIDHW") + .describe("Dimension ordering of weight. Can be 'OIDHW', 'OIDHW16o16i', etc." + "'O', 'I', 'D', 'H', 'W' stands for num_filter, input_channel, depth, height," + "and width dimensions respectively."); + TVM_ATTR_FIELD(out_layout).set_default("") + .describe("Dimension ordering of output. Can be 'NCDHW', 'NDHWC', etc." + "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" + "dimensions respectively. Default to be same as input layout."); + + // use 0 bits to indicate none. 
+ TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); + } +}; + + /*! \brief Attributes used in softmax operators */ struct SoftmaxAttrs : public tvm::AttrsNode { int axis; diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index a753c173ff10..26a76489bcd4 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -34,6 +34,11 @@ class Conv2DWinogradAttrs(Attrs): """Attributes for nn.contrib_conv2d_winograd_without_weight_transform""" +@tvm._ffi.register_object("relay.attrs.Conv3DWinogradAttrs") +class Conv3DWinogradAttrs(Attrs): + """Attributes for nn.contrib_conv3d_winograd_without_weight_transform""" + + @tvm._ffi.register_object("relay.attrs.ConvWinogradWeightTransformAttrs") class ConvWinogradWeightTransformAttrs(Attrs): """Attributes for nn.contrib_convNd_winograd_weight_transform""" diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index f074baa77290..e147eef4ab7f 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -63,6 +63,46 @@ Expr MakeConv(Expr data, return Call(op, {data, weight}, Attrs(attrs), {}); } +template +Expr MakeConvWinograd(Expr data, + Expr weight, + int tile_size, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype, + std::string op_name) { + auto attrs = make_object(); + attrs->tile_size = tile_size; + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->groups = groups; + attrs->channels = std::move(channels); + attrs->kernel_size = std::move(kernel_size); + attrs->data_layout = std::move(data_layout); + attrs->kernel_layout = std::move(kernel_layout); + attrs->out_layout = std::move(out_layout); + attrs->out_dtype = std::move(out_dtype); + static const Op& op = Op::Get(op_name); + return Call(op, {data, weight}, Attrs(attrs), {}); +} + +Expr MakeConvWinogradWeightTransform(Expr weight, + int tile_size, + std::string op_name) { + auto attrs = make_object(); + attrs->tile_size = tile_size; + static const Op& op = Op::Get(op_name); + return Call(op, {weight}, Attrs(attrs), {}); +} + // relay.nn.conv1d TVM_REGISTER_NODE_TYPE(Conv1DAttrs); @@ -522,122 +562,25 @@ said convolution. // relay.nn.contrib_conv2d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradAttrs); -template -bool Conv2DWinogradRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); - const auto* data = types[0].as(); - if (data == nullptr) return false; - static const Layout kNCHW("NCHW"); - static const Layout kOIHW("OIHW"); - - const Param* param = attrs.as(); - CHECK(param != nullptr); - const Layout in_layout(param->data_layout); - const Layout kernel_layout(param->kernel_layout); - - const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(trans_in_layout.defined()) - << "Conv only support input layouts that are convertible from NCHW." - << " But got " << in_layout; - - const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); - CHECK(trans_kernel_layout.defined()) - << "Conv only support kernel layouts that are convertible from OIHW." - << " But got "<< kernel_layout; - - Layout out_layout(param->out_layout == "" ? 
param->data_layout : param->out_layout); - const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); - CHECK(trans_out_layout.defined()) - << "Conv only support output layouts that are convertible from NCHW." - << " But got " << out_layout; - - Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); - - IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - - CHECK(param->kernel_size.defined() && param->channels.defined()) - << "The kernel size and channels of a Conv must be set or infered by previous pass"; - - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); - - channels = param->channels; - dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; - dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; - - // NOTE: Do not check weight shape here! - // Different backend requires different layout to compute - // the batch gemm stage in winograd efficiently, but we want to - // make this op work for all backends. - // So we accept all weight shapes, and assume the TOPI developers - // can handle this correctly in alter_op_layout. - - // dilation - Array oshape({dshape_nchw[0], channels, 0, 0}); - - IndexExpr pad_h, pad_w; - GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); - if (!dshape_nchw[2].as()) { - oshape.Set(2, (dshape_nchw[2] + pad_h - - dilated_ksize_y) / param->strides[0] + 1); - } else { - oshape.Set(2, dshape_nchw[2]); - } - if (!dshape_nchw[3].as()) { - oshape.Set(3, (dshape_nchw[3] + pad_w - - dilated_ksize_x) / param->strides[1] + 1); - } else { - oshape.Set(3, dshape_nchw[3]); - } - - DataType out_dtype = param->out_dtype; - if (out_dtype.bits() == 0) { - out_dtype = data->dtype; - } - oshape = trans_out_layout.BackwardShape(oshape); - // assign output type - reporter->Assign(types[2], TensorType(oshape, out_dtype)); - return true; -} - - -// Positional relay function to create conv2d winograd operator -// used by frontend FFI. 
-Expr MakeConv2DWinograd(Expr data, - Expr weight, - int tile_size, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->tile_size = tile_size; - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_conv2d_winograd_without_weight_transform"); - return Call(op, {data, weight}, Attrs(attrs), {}); -} - - TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_without_weight_transform") -.set_body_typed(MakeConv2DWinograd); +.set_body_typed([](Expr data, + Expr weight, + int tile_size, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeConvWinograd( + data, weight, tile_size, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.contrib.conv2d_winograd_without_weight_tranform"); +}); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") @@ -664,58 +607,95 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") // relay.nn.contrib_conv2d_winograd_weight_transform TVM_REGISTER_NODE_TYPE(ConvWinogradWeightTransformAttrs); -bool Conv2DWinogradWeightTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); - const auto* data = types[0].as(); - if (data == nullptr) return false; +TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_weight_transform") +.set_body_typed([](Expr weight, + int tile_size) { + return MakeConvWinogradWeightTransform( + weight, tile_size, "nn.contrib.conv2d_winograd_weight_tranform"); +}); - const ConvWinogradWeightTransformAttrs* param = attrs.as(); - CHECK(param != nullptr); +RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") +.describe(R"code(Weight transformation of winograd fast convolution algorithm. - CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; +Separate this into another operator in order to enable Precompute Pass to compute the +weight transformation in advance. 
- // each pad width element should be a pair of positive integers - std::vector oshape { - param->tile_size + data->shape[2] - 1, - param->tile_size + data->shape[3] - 1, - data->shape[0], - data->shape[1], - }; +- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +)code" TVM_ADD_FILELINE) +.set_attrs_type() +.set_num_inputs(1) +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); - reporter->Assign(types[1], TensorType(Array(oshape), - data->dtype)); - return true; -} -Expr MakeConv2DWinogradWeightTransform(Expr weight, - int tile_size) { - auto attrs = make_object(); - attrs->tile_size = tile_size; - static const Op& op = Op::Get("nn.contrib_conv2d_winograd_weight_transform"); - return Call(op, {weight}, Attrs(attrs), {}); -} +// relay.nn.contrib_conv3d_winograd_without_weight_transform +TVM_REGISTER_NODE_TYPE(Conv3DWinogradAttrs); +TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_without_weight_transform") +.set_body_typed([](Expr data, + Expr weight, + int tile_size, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeConvWinograd( + data, weight, tile_size, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.contrib.conv3d_winograd_without_weight_tranform"); +}); -TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_weight_transform") -.set_body_typed(MakeConv2DWinogradWeightTransform); +RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") +.describe(R"code(Compute conv3d with winograd algorithm. Only supports NCDHW layout. + This operator assumes the weight tensor is already pre-transformed by + nn.contrib_conv3d_winograd_weight_transform. -RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") -.describe(R"code(Weight transformation of winograd fast convolution algorithm. +- **data**: Input is 5D array of shape (batch_size, in_channels, depth, height, width) +- **weight**: Any shape + We do not check the shape for this input tensor. Since different backend + has different layout strategy. + +- **out**: Output is 5D array of shape (batch_size, channels, depth, out_height, out_width) +)code" TVM_ADD_FILELINE) +.set_attrs_type() +.set_num_inputs(2) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv3DWinograd", Conv3DWinogradRel) +.set_attr("FInferCorrectLayout", + ConvInferCorrectLayout); + + +// relay.nn.contrib_conv3d_winograd_weight_transform +TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform") +.set_body_typed([](Expr weight, + int tile_size) { + return MakeConvWinogradWeightTransform( + weight, tile_size, "nn.contrib.conv3d_winograd_weight_tranform"); +}); + +RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_weight_transform") +.describe(R"code(Weight transformation of winograd fast 3d convolution algorithm. Separate this into another operator in order to enable Precompute Pass to compute the weight transformation in advance. 
-- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1], kernel_size[2]) )code" TVM_ADD_FILELINE) .set_attrs_type() .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) -.add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); +.add_type_rel("Conv3DWinogradWeightTransform", Conv3DWinogradWeightTransformRel); // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 6a69178f49b1..f3b2f6b04f3f 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -360,6 +360,194 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, return true; } +bool Conv2DWinogradWeightTransformRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) return false; + + const ConvWinogradWeightTransformAttrs* param = attrs.as(); + CHECK(param != nullptr); + + CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; + + // each pad width element should be a pair of positive integers + std::vector oshape { + param->tile_size + data->shape[2] - 1, + param->tile_size + data->shape[3] - 1, + data->shape[0], + data->shape[1], + }; + + reporter->Assign(types[1], TensorType(Array(oshape), + data->dtype)); + return true; +} + +template +bool Conv2DWinogradRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + if (data == nullptr) return false; + static const Layout kNCHW("NCHW"); + static const Layout kOIHW("OIHW"); + + const Param* param = attrs.as(); + CHECK(param != nullptr); + const Layout in_layout(param->data_layout); + const Layout kernel_layout(param->kernel_layout); + + const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) + << "Conv only support input layouts that are convertible from NCHW." + << " But got " << in_layout; + + const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) + << "Conv only support kernel layouts that are convertible from OIHW." + << " But got "<< kernel_layout; + + Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); + const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) + << "Conv only support output layouts that are convertible from NCHW." + << " But got " << out_layout; + + Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x; + + CHECK(param->kernel_size.defined() && param->channels.defined()) + << "The kernel size and channels of a Conv must be set or infered by previous pass"; + + CHECK_EQ(param->kernel_size.size(), 2); + CHECK_EQ(param->dilation.size(), 2); + + channels = param->channels; + dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + + // NOTE: Do not check weight shape here! + // Different backend requires different layout to compute + // the batch gemm stage in winograd efficiently, but we want to + // make this op work for all backends. 
+ // So we accept all weight shapes, and assume the TOPI developers + // can handle this correctly in alter_op_layout. + + // dilation + Array oshape({dshape_nchw[0], channels, 0, 0}); + + IndexExpr pad_h, pad_w; + GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); + if (!dshape_nchw[2].as()) { + oshape.Set(2, (dshape_nchw[2] + pad_h + - dilated_ksize_y) / param->strides[0] + 1); + } else { + oshape.Set(2, dshape_nchw[2]); + } + if (!dshape_nchw[3].as()) { + oshape.Set(3, (dshape_nchw[3] + pad_w + - dilated_ksize_x) / param->strides[1] + 1); + } else { + oshape.Set(3, dshape_nchw[3]); + } + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + oshape = trans_out_layout.BackwardShape(oshape); + // assign output type + reporter->Assign(types[2], TensorType(oshape, out_dtype)); + return true; +} + +template +bool Conv2DWinogradRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + if (data == nullptr) return false; + static const Layout kNCHW("NCHW"); + static const Layout kOIHW("OIHW"); + + const Param* param = attrs.as(); + CHECK(param != nullptr); + const Layout in_layout(param->data_layout); + const Layout kernel_layout(param->kernel_layout); + + const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) + << "Conv only support input layouts that are convertible from NCHW." + << " But got " << in_layout; + + const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) + << "Conv only support kernel layouts that are convertible from OIHW." + << " But got "<< kernel_layout; + + Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); + const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) + << "Conv only support output layouts that are convertible from NCHW." + << " But got " << out_layout; + + Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x; + + CHECK(param->kernel_size.defined() && param->channels.defined()) + << "The kernel size and channels of a Conv must be set or infered by previous pass"; + + CHECK_EQ(param->kernel_size.size(), 2); + CHECK_EQ(param->dilation.size(), 2); + + channels = param->channels; + dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + + // NOTE: Do not check weight shape here! + // Different backend requires different layout to compute + // the batch gemm stage in winograd efficiently, but we want to + // make this op work for all backends. + // So we accept all weight shapes, and assume the TOPI developers + // can handle this correctly in alter_op_layout. 
+ + // dilation + Array oshape({dshape_nchw[0], channels, 0, 0}); + + IndexExpr pad_h, pad_w; + GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); + if (!dshape_nchw[2].as()) { + oshape.Set(2, (dshape_nchw[2] + pad_h + - dilated_ksize_y) / param->strides[0] + 1); + } else { + oshape.Set(2, dshape_nchw[2]); + } + if (!dshape_nchw[3].as()) { + oshape.Set(3, (dshape_nchw[3] + pad_w + - dilated_ksize_x) / param->strides[1] + 1); + } else { + oshape.Set(3, dshape_nchw[3]); + } + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + oshape = trans_out_layout.BackwardShape(oshape); + // assign output type + reporter->Assign(types[2], TensorType(oshape, out_dtype)); + return true; +} + template Array > ConvInferCorrectLayout( const Attrs& attrs, From 3d0321240a7d68de7211e997ccb22b0cb4496157 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sun, 29 Mar 2020 17:03:43 -0700 Subject: [PATCH 08/28] Refactored relay convolutions. --- src/relay/op/nn/convolution.cc | 584 ++++++++++----------------------- src/relay/op/nn/convolution.h | 329 +++++++++++++++++-- 2 files changed, 465 insertions(+), 448 deletions(-) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index e147eef4ab7f..7cfd85747b47 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -103,6 +103,38 @@ Expr MakeConvWinogradWeightTransform(Expr weight, return Call(op, {weight}, Attrs(attrs), {}); } +template +Expr MakeDeformableConv(Expr data, + Expr offset, + Expr weight, + Array strides, + Array padding, + Array dilation, + int deformable_groups, + int groups, + int channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype, + std::string op_name) { + auto attrs = make_object(); + attrs->strides = strides; + attrs->padding = padding; + attrs->dilation = dilation; + attrs->deformable_groups = deformable_groups; + attrs->groups = groups; + attrs->channels = channels; + attrs->kernel_size = kernel_size; + attrs->data_layout = data_layout; + attrs->kernel_layout = kernel_layout; + attrs->out_layout = out_layout; + attrs->out_dtype = out_dtype; + static const Op& op = Op::Get(op_name); + return Call(op, {data, offset, weight}, Attrs{attrs}, {}); +} + // relay.nn.conv1d TVM_REGISTER_NODE_TYPE(Conv1DAttrs); @@ -193,6 +225,7 @@ with the layer input to produce a tensor of outputs. .add_type_rel("Conv2D", Conv2DRel) .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + // relay.nn.conv3d TVM_REGISTER_NODE_TYPE(Conv3DAttrs); @@ -238,141 +271,31 @@ with the layer input to produce a tensor of outputs. .add_type_rel("Conv3D", Conv3DRel) .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + // relay.nn.conv2d_transpose TVM_REGISTER_NODE_TYPE(Conv2DTransposeAttrs); -bool Conv2DTransposeRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); - const auto* data = types[0].as(); - const auto* weight = types[1].as(); - if (data == nullptr) return false; - - static const Layout kNCHW("NCHW"); - static const Layout kOIHW("OIHW"); - - const Conv2DTransposeAttrs* param = attrs.as(); - CHECK(param != nullptr); - const Layout in_layout(param->data_layout); - const Layout kernel_layout(param->kernel_layout); - - const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(trans_in_layout.defined()) - << "Conv only support input layouts that are convertible from NCHW." 
- << " But got " << in_layout; - - const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); - CHECK(trans_kernel_layout.defined()) - << "Conv only support kernel layouts that are convertible from OIHW." - << " But got "<< kernel_layout; - - Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); - CHECK(trans_out_layout.defined()) - << "Conv only support output layouts that are convertible from NCHW." - << " But got " << out_layout; - - IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - - auto dshape_nchw = trans_in_layout.ForwardShape(data->shape); - - // infer weight if the kernel_size and channels are defined - if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); - - Array wshape({dshape_nchw[1], - indexdiv(param->channels, param->groups), - param->kernel_size[0], - param->kernel_size[1]}); - - wshape = trans_kernel_layout.BackwardShape(wshape); - dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; - dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; - channels = param->channels; - - // assign result to reporter - reporter->Assign(types[1], TensorType(wshape, data->dtype)); - } else { - // use weight to infer the conv shape. - if (weight == nullptr) return false; - auto wshape = trans_kernel_layout.ForwardShape(weight->shape); - if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && - reporter->AssertEQ(param->kernel_size[1], wshape[3])) - << "Conv2D: shape of weight is inconsistent with kernel_size, " - << " kernel_size=" << param->kernel_size - << " wshape=" << Array(wshape); - } - if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[1])) - << "Conv2D: shape of weight is inconsistent with channels, " - << " channels=" << param->channels - << " wshape=" << Array(wshape); - } - CHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[0])); - channels = wshape[1]; - dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; - dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; - } - // dilation - Array oshape({dshape_nchw[0], channels, 0, 0}); - IndexExpr pad_h, pad_w; - GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); - oshape.Set(2, (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y - - pad_h + param->output_padding[0])); - oshape.Set(3, (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x - - pad_w + param->output_padding[1])); - - DataType out_dtype = param->out_dtype; - if (out_dtype.bits() == 0) { - out_dtype = data->dtype; - } - oshape = trans_out_layout.BackwardShape(oshape); - reporter->Assign(types[2], TensorType(oshape, out_dtype)); - return true; -} - - -Expr MakeConv2DTranspose(Expr data, - Expr weight, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - Array output_padding, - DataType out_dtype) { - auto attrs = make_object(); - attrs->channels = std::move(channels); - attrs->kernel_size = std::move(kernel_size); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->output_padding = std::move(output_padding); - attrs->dilation = std::move(dilation); - attrs->groups 
= groups; - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.conv2d_transpose"); - return Call(op, {data, weight}, Attrs(attrs), {}); -} - - TVM_REGISTER_GLOBAL("relay.op.nn._make.conv2d_transpose") -.set_body_typed(MakeConv2DTranspose); +.set_body_typed([](Expr data, + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeConv( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.conv2d_transpose"); +}); RELAY_REGISTER_OP("nn.conv2d_transpose") -.describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution). + .describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution). The need for transposed convolutions generally arises from the desire to use a transformation going in the opposite direction @@ -393,145 +316,39 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1] )code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(2) -.set_attr("FInferCorrectLayout", - ConvInferCorrectLayout) -.add_type_rel("Conv2DTranspose", Conv2DTransposeRel); - + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "Tensor", "The input tensor.") + .add_argument("weight", "Tensor", "The weight tensor.") + .set_support_level(2) + .set_attr("FInferCorrectLayout", + ConvInferCorrectLayout) + .add_type_rel("Conv2DTranspose", Conv2DTransposeRel); // relay.nn.conv1d_transpose TVM_REGISTER_NODE_TYPE(Conv1DTransposeAttrs); -bool Conv1DTransposeRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); - const auto* data = types[0].as(); - const auto* weight = types[1].as(); - if (data == nullptr) return false; - - static const Layout kNCW("NCW"); - static const Layout kOIW("OIW"); - - const Conv1DTransposeAttrs* param = attrs.as(); - CHECK(param != nullptr); - const Layout in_layout(param->data_layout); - const Layout kernel_layout(param->kernel_layout); - - const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCW); - CHECK(trans_in_layout.defined()) - << "Conv only support input layouts that are convertible from NCW." - << " But got " << in_layout; - - const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIW); - CHECK(trans_kernel_layout.defined()) - << "Conv only support kernel layouts that are convertible from OIW." - << " But got "<< kernel_layout; - - Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCW); - CHECK(trans_out_layout.defined()) - << "Conv only support output layouts that are convertible from NCW." 
- << " But got " << out_layout; - - IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - - auto dshape_ncw = trans_in_layout.ForwardShape(data->shape); - - // infer weight if the kernel_size and channels are defined - if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 1); - CHECK_EQ(param->dilation.size(), 1); - - Array wshape({dshape_ncw[1], - indexdiv(param->channels, param->groups), - param->kernel_size[0]}); - - wshape = trans_kernel_layout.BackwardShape(wshape); - dilated_ksize_x = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; - channels = param->channels; - - // assign result to reporter - reporter->Assign(types[1], TensorType(wshape, data->dtype)); - } else { - // use weight to infer the conv shape. - if (weight == nullptr) return false; - auto wshape = trans_kernel_layout.ForwardShape(weight->shape); - if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 1); - // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2])) - << "Conv1D: shape of weight is inconsistent with kernel_size, " - << " kernel_size=" << param->kernel_size - << " wshape=" << Array(wshape); - } - if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[1])) - << "Conv1D: shape of weight is inconsistent with channels, " - << " channels=" << param->channels - << " wshape=" << Array(wshape); - } - CHECK(reporter->AssertEQ(indexdiv(dshape_ncw[1], param->groups), wshape[0])); - channels = wshape[1]; - dilated_ksize_x = 1 + (wshape[2] - 1) * param->dilation[0]; - } - // dilation - IndexExpr pad_w; - GetPaddingWidth(param->padding, &pad_w); - Array oshape({dshape_ncw[0], channels, 0}); - oshape.Set(2, (param->strides[0] * (dshape_ncw[2] - 1) + dilated_ksize_x - - pad_w + param->output_padding[0])); - - DataType out_dtype = param->out_dtype; - if (out_dtype.bits() == 0) { - out_dtype = data->dtype; - } - oshape = trans_out_layout.BackwardShape(oshape); - reporter->Assign(types[2], TensorType(oshape, out_dtype)); - return true; -} - - -Expr MakeConv1DTranspose(Expr data, - Expr weight, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - Array output_padding, - DataType out_dtype) { - auto attrs = make_object(); - attrs->channels = std::move(channels); - attrs->kernel_size = std::move(kernel_size); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->output_padding = std::move(output_padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.conv1d_transpose"); - return Call(op, {data, weight}, Attrs(attrs), {}); -} - - TVM_REGISTER_GLOBAL("relay.op.nn._make.conv1d_transpose") -.set_body_typed(MakeConv1DTranspose); +.set_body_typed([](Expr data, + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeConv( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.conv1d_transpose"); +}); RELAY_REGISTER_OP("nn.conv1d_transpose") 
-.describe(R"code(Transposed 1D convolution layer (sometimes called Deconvolution). + .describe(R"code(Transposed 1D convolution layer (sometimes called Deconvolution). The need for transposed convolutions generally arises from the desire to use a transformation going in the opposite direction @@ -551,13 +368,12 @@ said convolution. out_width = (width-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0] )code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(2) -.add_type_rel("Conv1DTranspose", Conv1DTransposeRel); - + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "Tensor", "The input tensor.") + .add_argument("weight", "Tensor", "The weight tensor.") + .set_support_level(2) + .add_type_rel("Conv1DTranspose", Conv1DTransposeRel); // relay.nn.contrib_conv2d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradAttrs); @@ -615,19 +431,19 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_weight_transform" }); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") -.describe(R"code(Weight transformation of winograd fast convolution algorithm. + .describe(R"code(Weight transformation of winograd fast convolution algorithm. Separate this into another operator in order to enable Precompute Pass to compute the weight transformation in advance. - **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) )code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(1) -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); - + .set_attrs_type() + .set_num_inputs(1) + .add_argument("weight", "Tensor", "The weight tensor.") + .set_support_level(10) + .add_type_rel("Conv2DWinogradWeightTransform", + Conv2DWinogradWeightTransformRel); // relay.nn.contrib_conv3d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv3DWinogradAttrs); @@ -652,9 +468,8 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_without_weight_tr kernel_layout, out_layout, out_dtype, "nn.contrib.conv3d_winograd_without_weight_tranform"); }); - RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") -.describe(R"code(Compute conv3d with winograd algorithm. Only supports NCDHW layout. + .describe(R"code(Compute conv3d with winograd algorithm. Only supports NCDHW layout. This operator assumes the weight tensor is already pre-transformed by nn.contrib_conv3d_winograd_weight_transform. 
@@ -665,15 +480,14 @@ RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") - **out**: Output is 5D array of shape (batch_size, channels, depth, out_height, out_width) )code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv3DWinograd", Conv3DWinogradRel) -.set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); - + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "Tensor", "The input tensor.") + .add_argument("weight", "Tensor", "The weight tensor.") + .set_support_level(10) + .add_type_rel("Conv3DWinograd", Conv3DWinogradRel) + .set_attr("FInferCorrectLayout", + ConvInferCorrectLayout); // relay.nn.contrib_conv3d_winograd_weight_transform TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform") @@ -684,54 +498,23 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform" }); RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_weight_transform") -.describe(R"code(Weight transformation of winograd fast 3d convolution algorithm. + .describe(R"code(Weight transformation of winograd fast 3d convolution algorithm. Separate this into another operator in order to enable Precompute Pass to compute the weight transformation in advance. - **weight**: (channels, in_channels, kernel_size[0], kernel_size[1], kernel_size[2]) )code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(1) -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv3DWinogradWeightTransform", Conv3DWinogradWeightTransformRel); - + .set_attrs_type() + .set_num_inputs(1) + .add_argument("weight", "Tensor", "The weight tensor.") + .set_support_level(10) + .add_type_rel("Conv3DWinogradWeightTransform", + Conv3DWinogradWeightTransformRel); // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); -bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); - const auto* data = types[0].as(); - if (data == nullptr) { - return false; - } - - const Conv2DWinogradNNPACKWeightTransformAttrs* param = - attrs.as(); - CHECK(param != nullptr); - - CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; - - std::vector oshape{ - data->shape[0], - data->shape[1], - 8, - 8, - }; - - DataType out_dtype = param->out_dtype; - if (out_dtype.bits() == 0) { - out_dtype = data->dtype; - } - reporter->Assign(types[1], TensorType(Array(oshape), out_dtype)); - return true; -} - Expr MakeConv2DWinogradNNPACKWeightTransform(Expr weight, int convolution_algorithm, DataType out_dtype) { @@ -746,50 +529,40 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_nnpack_weight_tra .set_body_typed(MakeConv2DWinogradNNPACKWeightTransform); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_weight_transform") -.describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. + .describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. Separate this into another symbol in order to enable Precompute Pass to compute the weight transformation in advance. 
- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) )code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(1) -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); + .set_attrs_type() + .set_num_inputs(1) + .add_argument("weight", "Tensor", "The weight tensor.") + .set_support_level(10) + .add_type_rel("Conv2DWinogradNNPACKWeightTransform", + Conv2DWinogradNNPACKWeightTransformRel); // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. -Expr MakeConv2DNCHWc(Expr data, - Expr kernel, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_conv2d_NCHWc"); - return Call(op, {data, kernel}, Attrs(attrs), {}); -} - TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc") -.set_body_typed(MakeConv2DNCHWc); +.set_body_typed([](Expr data, + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeConv( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.contrib_conv2d_NCHWc"); +}); RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc") @@ -811,35 +584,24 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc") // Positional relay function to create depthwise conv2d NCHWc operator // used by frontend FFI. 
-Expr MakeDepthwiseConv2DNCHWc(Expr data, - Expr kernel, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_depthwise_conv2d_NCHWc"); - return Call(op, {data, kernel}, Attrs(attrs), {}); -} - TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_depthwise_conv2d_NCHWc") -.set_body_typed(MakeDepthwiseConv2DNCHWc); +.set_body_typed([](Expr data, + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeConv( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.contrib_depthwise_conv2d_NCHWc"); +}); RELAY_REGISTER_OP("nn.contrib_depthwise_conv2d_NCHWc") @@ -970,38 +732,26 @@ by concating all the *g* results. // Positional relay function to create deformable_conv2d operator // used by frontend FFI. -Expr MakeDeformableConv2D(Expr data, - Expr offset, - Expr weight, - Array strides, - Array padding, - Array dilation, - int deformable_groups, - int groups, - int channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = strides; - attrs->padding = padding; - attrs->dilation = dilation; - attrs->deformable_groups = deformable_groups; - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = kernel_size; - attrs->data_layout = data_layout; - attrs->kernel_layout = kernel_layout; - attrs->out_layout = out_layout; - attrs->out_dtype = out_dtype; - static const Op& op = Op::Get("nn.deformable_conv2d"); - return Call(op, {data, offset, weight}, Attrs{attrs}, {}); -} - TVM_REGISTER_GLOBAL("relay.op.nn._make.deformable_conv2d") -.set_body_typed(MakeDeformableConv2D); +.set_body_typed([](Expr data, + Expr offset, + Expr weight, + Array strides, + Array padding, + Array dilation, + int deformable_groups, + int groups, + int channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + return MakeDeformableConv( + data, offset, weight, strides, padding, dilation, + deformable_groups, groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.deformable_conv2d"); +}); } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index f3b2f6b04f3f..2e4e0b84726a 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -33,6 +33,8 @@ namespace tvm { namespace relay { + +// Standard convolution operator shape relations template bool Conv1DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { @@ -360,6 +362,9 @@ bool Conv3DRel(const Array& types, int 
num_inputs, const Attrs& attrs, return true; } + +// Winograd convolution shape relations +template bool Conv2DWinogradWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -368,7 +373,7 @@ bool Conv2DWinogradWeightTransformRel(const Array& types, const auto* data = types[0].as(); if (data == nullptr) return false; - const ConvWinogradWeightTransformAttrs* param = attrs.as(); + const ConvWinogradWeightTransformAttrs* param = attrs.as(); CHECK(param != nullptr); CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; @@ -386,7 +391,69 @@ bool Conv2DWinogradWeightTransformRel(const Array& types, return true; } -template + +template +bool Conv3DWinogradWeightTransformRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) return false; + + const ConvWinogradWeightTransformAttrs* param = attrs.as(); + CHECK(param != nullptr); + + CHECK_EQ(data->shape.size(), 5) << "Only support NCDHW normal kernel layout"; + + // each pad width element should be a pair of positive integers + std::vector oshape { + param->tile_size + data->shape[2] - 1, + param->tile_size + data->shape[3] - 1, + param->tile_size + data->shape[4] - 1, + data->shape[0], + data->shape[1], + }; + + reporter->Assign(types[1], TensorType(Array(oshape), + data->dtype)); + return true; +} + + +template +bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + return false; + } + + const Conv2DWinogradNNPACKWeightTransformAttrs* param = attrs.as(); + CHECK(param != nullptr); + + CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; + + std::vector oshape{ + data->shape[0], + data->shape[1], + 8, + 8, + }; + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + reporter->Assign(types[1], TensorType(Array(oshape), out_dtype)); + return true; +} + + +template bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -397,7 +464,7 @@ bool Conv2DWinogradRel(const Array& types, static const Layout kNCHW("NCHW"); static const Layout kOIHW("OIHW"); - const Param* param = attrs.as(); + const AttrType* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); @@ -467,51 +534,53 @@ bool Conv2DWinogradRel(const Array& types, return true; } -template -bool Conv2DWinogradRel(const Array& types, + +template +bool Conv3DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; - static const Layout kNCHW("NCHW"); - static const Layout kOIHW("OIHW"); + static const Layout kNCDHW("NCDHW"); + static const Layout kOIDHW("OIDHW"); - const Param* param = attrs.as(); + const AttrType* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); + const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCDHW); CHECK(trans_in_layout.defined()) - << "Conv only support input layouts that are convertible from NCHW." 
+ << "Conv only support input layouts that are convertible from NCDHW." << " But got " << in_layout; - const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); + const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIDHW); CHECK(trans_kernel_layout.defined()) - << "Conv only support kernel layouts that are convertible from OIHW." + << "Conv only support kernel layouts that are convertible from OIDHW." << " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); + const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCDHW); CHECK(trans_out_layout.defined()) - << "Conv only support output layouts that are convertible from NCHW." + << "Conv only support output layouts that are convertible from NCDHW." << " But got " << out_layout; - Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); + Array dshape_ncdhw = trans_in_layout.ForwardShape(data->shape); - IndexExpr channels, dilated_ksize_y, dilated_ksize_x; + IndexExpr channels, dilated_ksize_d, dilated_ksize_y, dilated_ksize_x; CHECK(param->kernel_size.defined() && param->channels.defined()) << "The kernel size and channels of a Conv must be set or infered by previous pass"; - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); + CHECK_EQ(param->kernel_size.size(), 3); + CHECK_EQ(param->dilation.size(), 3); channels = param->channels; - dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; - dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + dilated_ksize_d = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_y = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + dilated_ksize_x = 1 + (param->kernel_size[2] - 1) * param->dilation[2]; // NOTE: Do not check weight shape here! // Different backend requires different layout to compute @@ -521,21 +590,27 @@ bool Conv2DWinogradRel(const Array& types, // can handle this correctly in alter_op_layout. 
// dilation - Array oshape({dshape_nchw[0], channels, 0, 0}); + Array oshape({dshape_ncdhw[0], channels, 0, 0, 0}); - IndexExpr pad_h, pad_w; - GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); - if (!dshape_nchw[2].as()) { - oshape.Set(2, (dshape_nchw[2] + pad_h - - dilated_ksize_y) / param->strides[0] + 1); + IndexExpr pad_d, pad_h, pad_w; + GetPaddingDepthHeightWidth(param->padding, &pad_d, &pad_h, &pad_w); + if (!dshape_ncdhw[2].as()) { + oshape.Set(2, (dshape_ncdhw[2] + pad_d + - dilated_ksize_d) / param->strides[0] + 1); } else { - oshape.Set(2, dshape_nchw[2]); + oshape.Set(2, dshape_ncdhw[2]); } - if (!dshape_nchw[3].as()) { - oshape.Set(3, (dshape_nchw[3] + pad_w - - dilated_ksize_x) / param->strides[1] + 1); + if (!dshape_ncdhw[2].as()) { + oshape.Set(3, (dshape_ncdhw[3] + pad_h + - dilated_ksize_y) / param->strides[1] + 1); } else { - oshape.Set(3, dshape_nchw[3]); + oshape.Set(3, dshape_ncdhw[3]); + } + if (!dshape_ncdhw[4].as()) { + oshape.Set(4, (dshape_ncdhw[4] + pad_w + - dilated_ksize_x) / param->strides[2] + 1); + } else { + oshape.Set(4, dshape_ncdhw[4]); } DataType out_dtype = param->out_dtype; @@ -548,6 +623,198 @@ bool Conv2DWinogradRel(const Array& types, return true; } + +// Transposed convolution shape relations +template +bool Conv1DTransposeRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + const auto* weight = types[1].as(); + if (data == nullptr) return false; + + static const Layout kNCW("NCW"); + static const Layout kOIW("OIW"); + + const Conv1DTransposeAttrs* param = attrs.as(); + CHECK(param != nullptr); + const Layout in_layout(param->data_layout); + const Layout kernel_layout(param->kernel_layout); + + const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCW); + CHECK(trans_in_layout.defined()) + << "Conv only support input layouts that are convertible from NCW." + << " But got " << in_layout; + + const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIW); + CHECK(trans_kernel_layout.defined()) + << "Conv only support kernel layouts that are convertible from OIW." + << " But got "<< kernel_layout; + + Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); + const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCW); + CHECK(trans_out_layout.defined()) + << "Conv only support output layouts that are convertible from NCW." + << " But got " << out_layout; + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x; + + auto dshape_ncw = trans_in_layout.ForwardShape(data->shape); + + // infer weight if the kernel_size and channels are defined + if (param->kernel_size.defined() && param->channels.defined()) { + CHECK_EQ(param->kernel_size.size(), 1); + CHECK_EQ(param->dilation.size(), 1); + + Array wshape({dshape_ncw[1], + indexdiv(param->channels, param->groups), + param->kernel_size[0]}); + + wshape = trans_kernel_layout.BackwardShape(wshape); + dilated_ksize_x = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + channels = param->channels; + + // assign result to reporter + reporter->Assign(types[1], TensorType(wshape, data->dtype)); + } else { + // use weight to infer the conv shape. 
+ if (weight == nullptr) return false; + auto wshape = trans_kernel_layout.ForwardShape(weight->shape); + if (param->kernel_size.defined()) { + CHECK_EQ(param->kernel_size.size(), 1); + // check the size + CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2])) + << "Conv1D: shape of weight is inconsistent with kernel_size, " + << " kernel_size=" << param->kernel_size + << " wshape=" << Array(wshape); + } + if (param->channels.defined()) { + CHECK(reporter->AssertEQ(param->channels, wshape[1])) + << "Conv1D: shape of weight is inconsistent with channels, " + << " channels=" << param->channels + << " wshape=" << Array(wshape); + } + CHECK(reporter->AssertEQ(indexdiv(dshape_ncw[1], param->groups), wshape[0])); + channels = wshape[1]; + dilated_ksize_x = 1 + (wshape[2] - 1) * param->dilation[0]; + } + // dilation + IndexExpr pad_w; + GetPaddingWidth(param->padding, &pad_w); + Array oshape({dshape_ncw[0], channels, 0}); + oshape.Set(2, (param->strides[0] * (dshape_ncw[2] - 1) + dilated_ksize_x - + pad_w + param->output_padding[0])); + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + oshape = trans_out_layout.BackwardShape(oshape); + reporter->Assign(types[2], TensorType(oshape, out_dtype)); + return true; +} + + +template +bool Conv2DTransposeRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + const auto* weight = types[1].as(); + if (data == nullptr) return false; + + static const Layout kNCHW("NCHW"); + static const Layout kOIHW("OIHW"); + + const Conv2DTransposeAttrs* param = attrs.as(); + CHECK(param != nullptr); + const Layout in_layout(param->data_layout); + const Layout kernel_layout(param->kernel_layout); + + const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) + << "Conv only support input layouts that are convertible from NCHW." + << " But got " << in_layout; + + const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) + << "Conv only support kernel layouts that are convertible from OIHW." + << " But got "<< kernel_layout; + + Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); + const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) + << "Conv only support output layouts that are convertible from NCHW." + << " But got " << out_layout; + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x; + + auto dshape_nchw = trans_in_layout.ForwardShape(data->shape); + + // infer weight if the kernel_size and channels are defined + if (param->kernel_size.defined() && param->channels.defined()) { + CHECK_EQ(param->kernel_size.size(), 2); + CHECK_EQ(param->dilation.size(), 2); + + Array wshape({dshape_nchw[1], + indexdiv(param->channels, param->groups), + param->kernel_size[0], + param->kernel_size[1]}); + + wshape = trans_kernel_layout.BackwardShape(wshape); + dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + channels = param->channels; + + // assign result to reporter + reporter->Assign(types[1], TensorType(wshape, data->dtype)); + } else { + // use weight to infer the conv shape. 
+ if (weight == nullptr) return false; + auto wshape = trans_kernel_layout.ForwardShape(weight->shape); + if (param->kernel_size.defined()) { + CHECK_EQ(param->kernel_size.size(), 2); + // check the size + CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + reporter->AssertEQ(param->kernel_size[1], wshape[3])) + << "Conv2D: shape of weight is inconsistent with kernel_size, " + << " kernel_size=" << param->kernel_size + << " wshape=" << Array(wshape); + } + if (param->channels.defined()) { + CHECK(reporter->AssertEQ(param->channels, wshape[1])) + << "Conv2D: shape of weight is inconsistent with channels, " + << " channels=" << param->channels + << " wshape=" << Array(wshape); + } + CHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[0])); + channels = wshape[1]; + dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; + } + // dilation + Array oshape({dshape_nchw[0], channels, 0, 0}); + IndexExpr pad_h, pad_w; + GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); + oshape.Set(2, (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y - + pad_h + param->output_padding[0])); + oshape.Set(3, (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x - + pad_w + param->output_padding[1])); + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + oshape = trans_out_layout.BackwardShape(oshape); + reporter->Assign(types[2], TensorType(oshape, out_dtype)); + return true; +} + + template Array > ConvInferCorrectLayout( const Attrs& attrs, From f03c5dda3203978945ecad1d1420a44a06e17324 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sun, 29 Mar 2020 17:38:20 -0700 Subject: [PATCH 09/28] Bug fixes. --- python/tvm/relay/op/op_attrs.py | 5 +++ python/tvm/relay/op/strategy/cuda.py | 1 + src/relay/op/nn/convolution.cc | 49 +++++++++++++++++++++++----- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 26a76489bcd4..1a07486cd095 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -34,6 +34,11 @@ class Conv2DWinogradAttrs(Attrs): """Attributes for nn.contrib_conv2d_winograd_without_weight_transform""" +@tvm._ffi.register_object("relay.attrs.Conv3DAttrs") +class Conv3DAttrs(Attrs): + """Attributes for nn.conv3d""" + + @tvm._ffi.register_object("relay.attrs.Conv3DWinogradAttrs") class Conv3DWinogradAttrs(Attrs): """Attributes for nn.contrib_conv3d_winograd_without_weight_transform""" diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index c3b2aef6b96b..3e3bc36693e8 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -233,6 +233,7 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): def conv3d_strategy_cuda(attrs, inputs, out_type, target): """conv3d cuda strategy""" strategy = _op.OpStrategy() + data, kernel = inputs layout = attrs.data_layout stride_d, stride_h, stride_w = attrs.get_int_tuple("strides") dilation_d, dilation_h, dilation_w = attrs.get_int_tuple("dilation") diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 7cfd85747b47..858633041803 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -103,6 +103,37 @@ Expr MakeConvWinogradWeightTransform(Expr weight, return Call(op, {weight}, Attrs(attrs), {}); } +template +Expr MakeConvTranspose(Expr data, + Expr weight, + 
Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + Array output_padding, + DataType out_dtype, + std::string op_name) { + auto attrs = make_object(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->groups = groups; + attrs->channels = std::move(channels); + attrs->kernel_size = std::move(kernel_size); + attrs->data_layout = std::move(data_layout); + attrs->kernel_layout = std::move(kernel_layout); + attrs->out_layout = std::move(out_layout); + attrs->output_padding = std::move(output_padding); + attrs->out_dtype = std::move(out_dtype); + static const Op& op = Op::Get(op_name); + return Call(op, {data, weight}, Attrs(attrs), {}); +} + template Expr MakeDeformableConv(Expr data, Expr offset, @@ -287,11 +318,12 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.conv2d_transpose") std::string data_layout, std::string kernel_layout, std::string out_layout, + Array output_padding, DataType out_dtype) { - return MakeConv( + return MakeConvTranspose( data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype, "nn.conv2d_transpose"); + kernel_layout, out_layout, output_padding, out_dtype, "nn.conv2d_transpose"); }); RELAY_REGISTER_OP("nn.conv2d_transpose") @@ -340,11 +372,12 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.conv1d_transpose") std::string data_layout, std::string kernel_layout, std::string out_layout, + Array output_padding, DataType out_dtype) { - return MakeConv( + return MakeConvTranspose( data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype, "nn.conv1d_transpose"); + kernel_layout, out_layout, output_padding, out_dtype, "nn.conv1d_transpose"); }); RELAY_REGISTER_OP("nn.conv1d_transpose") @@ -395,7 +428,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_without_weight_tr return MakeConvWinograd( data, weight, tile_size, strides, padding, dilation, groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype, "nn.contrib.conv2d_winograd_without_weight_tranform"); + kernel_layout, out_layout, out_dtype, "nn.contrib_conv2d_winograd_without_weight_transform"); }); @@ -427,7 +460,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_weight_transform" .set_body_typed([](Expr weight, int tile_size) { return MakeConvWinogradWeightTransform( - weight, tile_size, "nn.contrib.conv2d_winograd_weight_tranform"); + weight, tile_size, "nn.contrib_conv2d_winograd_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") @@ -465,7 +498,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_without_weight_tr return MakeConvWinograd( data, weight, tile_size, strides, padding, dilation, groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype, "nn.contrib.conv3d_winograd_without_weight_tranform"); + kernel_layout, out_layout, out_dtype, "nn.contrib.conv3d_winograd_without_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") @@ -494,7 +527,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform" .set_body_typed([](Expr weight, int tile_size) { return MakeConvWinogradWeightTransform( - weight, tile_size, "nn.contrib.conv3d_winograd_weight_tranform"); + weight, tile_size, 
"nn.contrib.conv3d_winograd_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_weight_transform") From 47097e6f2be56888757d62acc9c8c26779e0fc83 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sun, 29 Mar 2020 20:00:45 -0700 Subject: [PATCH 10/28] Fixed static bug in convolution. --- src/relay/op/nn/convolution.cc | 120 ++++++--------------------------- src/relay/op/nn/convolution.h | 82 ++++++++++++++++++++++ 2 files changed, 103 insertions(+), 99 deletions(-) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 858633041803..c31ebaadb602 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -59,7 +59,7 @@ Expr MakeConv(Expr data, attrs->kernel_layout = std::move(kernel_layout); attrs->out_layout = std::move(out_layout); attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get(op_name); + const Op& op = Op::Get(op_name); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -90,7 +90,7 @@ Expr MakeConvWinograd(Expr data, attrs->kernel_layout = std::move(kernel_layout); attrs->out_layout = std::move(out_layout); attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get(op_name); + const Op& op = Op::Get(op_name); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -99,25 +99,25 @@ Expr MakeConvWinogradWeightTransform(Expr weight, std::string op_name) { auto attrs = make_object(); attrs->tile_size = tile_size; - static const Op& op = Op::Get(op_name); + const Op& op = Op::Get(op_name); return Call(op, {weight}, Attrs(attrs), {}); } template Expr MakeConvTranspose(Expr data, - Expr weight, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - Array output_padding, - DataType out_dtype, - std::string op_name) { + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + Array output_padding, + DataType out_dtype, + std::string op_name) { auto attrs = make_object(); attrs->strides = std::move(strides); attrs->padding = std::move(padding); @@ -130,7 +130,7 @@ Expr MakeConvTranspose(Expr data, attrs->out_layout = std::move(out_layout); attrs->output_padding = std::move(output_padding); attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get(op_name); + const Op& op = Op::Get(op_name); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -162,7 +162,7 @@ Expr MakeDeformableConv(Expr data, attrs->kernel_layout = kernel_layout; attrs->out_layout = out_layout; attrs->out_dtype = out_dtype; - static const Op& op = Op::Get(op_name); + const Op& op = Op::Get(op_name); return Call(op, {data, offset, weight}, Attrs{attrs}, {}); } @@ -545,6 +545,7 @@ weight transformation in advance. .add_type_rel("Conv3DWinogradWeightTransform", Conv3DWinogradWeightTransformRel); + // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); @@ -576,6 +577,7 @@ weight transformation in advance. .add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); + // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. 
TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc") @@ -597,7 +599,6 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc") kernel_layout, out_layout, out_dtype, "nn.contrib_conv2d_NCHWc"); }); - RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc") .describe(R"code(Compute conv2d with NCHWc data layout. Only supports NCHW layout. - **data**: Input is 5D packed tensor. @@ -654,85 +655,6 @@ RELAY_REGISTER_OP("nn.contrib_depthwise_conv2d_NCHWc") ConvInferCorrectLayout); -bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); - const auto* data = types[0].as(); - const auto* weight = types[2].as(); - - CHECK(data); - auto* param = attrs.as(); - CHECK_EQ(param->data_layout, "NCHW") << "data layout not supported."; - CHECK_EQ(param->kernel_layout, "OIHW") << "kernel_layout not supported."; - - IndexExpr channels, dilated_ksize_y, dilated_ksize_x, ksize_y, ksize_x; - - // infer weight shape if kernel_size and channels are defiend - if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); - Array wshape( - {param->channels, - indexdiv(data->shape[1], param->groups), - param->kernel_size[0], - param->kernel_size[1]}); - channels = param->channels; - ksize_y = param->kernel_size[0]; - ksize_x = param->kernel_size[1]; - dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; - dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; - // assign result to reporter - reporter->Assign(types[2], TensorType(wshape, data->dtype)); - } else { - // use weight to infer the conv shape. - if (weight == nullptr) return false; - auto wshape = weight->shape; - if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && - reporter->AssertEQ(param->kernel_size[1], wshape[3])) - << "DeformableConv2D: shape of weight is inconsistent with kernel_size, " - << " kernel_size=" << param->kernel_size - << " wshape=" << wshape; - } - if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[0])) - << "DeformableConv2D: shape of weight is inconsistent with channels, " - << " channels=" << param->channels - << " wshape=" << wshape; - } - CHECK(reporter->AssertEQ(indexdiv(data->shape[1], param->groups), wshape[1])); - channels = wshape[0]; - ksize_y = wshape[2]; - ksize_x = wshape[3]; - dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; - dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; - } - // dilation - Array oshape({data->shape[0], channels, 0, 0}); - - IndexExpr pad_h, pad_w; - GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); - oshape.Set(2, indexdiv(data->shape[2] + pad_h - dilated_ksize_y, - param->strides[0]) + 1); - oshape.Set(3, indexdiv(data->shape[3] + pad_w - dilated_ksize_x, - param->strides[1]) + 1); - DataType out_dtype = param->out_dtype; - - // infer offset shape - Array offset_shape({data->shape[0], 2 * ksize_y * ksize_x * param->deformable_groups, - oshape[2], oshape[3]}); - reporter->Assign(types[1], TensorType(offset_shape, data->dtype)); - if (out_dtype.bits() == 0) { - out_dtype = data->dtype; - } - - reporter->Assign(types[3], TensorType(oshape, out_dtype)); - return true; -} - - TVM_REGISTER_NODE_TYPE(DeformableConv2DAttrs); RELAY_REGISTER_OP("nn.deformable_conv2d") @@ -761,7 +683,7 @@ by concating all the *g* results. 
.add_argument("offset", "Tensor", "The offset tensor.") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(5) -.add_type_rel("DeformableConv2D", DeformableConv2DRel); +.add_type_rel("DeformableConv2D", DeformableConv2DRel); // Positional relay function to create deformable_conv2d operator // used by frontend FFI. diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 2e4e0b84726a..b82a3b16f22e 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -830,6 +830,88 @@ Array > ConvInferCorrectLayout( params->data_layout : params->out_layout}}; } + +// Deformable Convolution shape relations. +template +bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 4); + const auto* data = types[0].as(); + const auto* weight = types[2].as(); + + CHECK(data); + auto* param = attrs.as(); + CHECK_EQ(param->data_layout, "NCHW") << "data layout not supported."; + CHECK_EQ(param->kernel_layout, "OIHW") << "kernel_layout not supported."; + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x, ksize_y, ksize_x; + + // infer weight shape if kernel_size and channels are defiend + if (param->kernel_size.defined() && param->channels.defined()) { + CHECK_EQ(param->kernel_size.size(), 2); + CHECK_EQ(param->dilation.size(), 2); + Array wshape( + {param->channels, + indexdiv(data->shape[1], param->groups), + param->kernel_size[0], + param->kernel_size[1]}); + channels = param->channels; + ksize_y = param->kernel_size[0]; + ksize_x = param->kernel_size[1]; + dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + // assign result to reporter + reporter->Assign(types[2], TensorType(wshape, data->dtype)); + } else { + // use weight to infer the conv shape. 
+ if (weight == nullptr) return false; + auto wshape = weight->shape; + if (param->kernel_size.defined()) { + CHECK_EQ(param->kernel_size.size(), 2); + // check the size + CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + reporter->AssertEQ(param->kernel_size[1], wshape[3])) + << "DeformableConv2D: shape of weight is inconsistent with kernel_size, " + << " kernel_size=" << param->kernel_size + << " wshape=" << wshape; + } + if (param->channels.defined()) { + CHECK(reporter->AssertEQ(param->channels, wshape[0])) + << "DeformableConv2D: shape of weight is inconsistent with channels, " + << " channels=" << param->channels + << " wshape=" << wshape; + } + CHECK(reporter->AssertEQ(indexdiv(data->shape[1], param->groups), wshape[1])); + channels = wshape[0]; + ksize_y = wshape[2]; + ksize_x = wshape[3]; + dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; + } + // dilation + Array oshape({data->shape[0], channels, 0, 0}); + + IndexExpr pad_h, pad_w; + GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); + oshape.Set(2, indexdiv(data->shape[2] + pad_h - dilated_ksize_y, + param->strides[0]) + 1); + oshape.Set(3, indexdiv(data->shape[3] + pad_w - dilated_ksize_x, + param->strides[1]) + 1); + DataType out_dtype = param->out_dtype; + + // infer offset shape + Array offset_shape({data->shape[0], 2 * ksize_y * ksize_x * param->deformable_groups, + oshape[2], oshape[3]}); + reporter->Assign(types[1], TensorType(offset_shape, data->dtype)); + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + + reporter->Assign(types[3], TensorType(oshape, out_dtype)); + return true; +} + + } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_NN_CONVOLUTION_H_ From 6499491b5b7e8cdcfc93516856bd67a800679019 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Mon, 30 Mar 2020 11:44:46 -0700 Subject: [PATCH 11/28] Added conv3d alter op layout and related support. 
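The rewrite this commit wires up can be sketched at the Relay level as follows (illustration only; the sizes and tile_size = 4 are assumptions, while the two wrapper functions are exactly the ones added to python/tvm/relay/op/nn/nn.py below): an eligible nn.conv3d with a 3x3x3 kernel is replaced by a pre-computed weight transform feeding the winograd convolution. The backend-specific decision of when to apply this lives in topi.nn.conv3d_alter_layout and the new conv3d_alter_op.py.

import tvm
from tvm import relay

# Hypothetical NCDHW workload; tile_size 4 is an assumption for illustration.
data = relay.var("data", shape=(1, 32, 16, 56, 56), dtype="float32")
weight = relay.var("weight", shape=(64, 32, 3, 3, 3), dtype="float32")
tile_size = 4

# What the alter-op-layout pass conceptually turns nn.conv3d(data, weight, ...) into:
packed = relay.nn.contrib_conv3d_winograd_weight_transform(weight, tile_size=tile_size)
conv = relay.nn.contrib_conv3d_winograd_without_weight_transform(
    data, packed, tile_size=tile_size,
    strides=(1, 1, 1), padding=(1, 1, 1),
    channels=64, kernel_size=(3, 3, 3))

func = relay.Function(relay.analysis.free_vars(conv), conv)
mod = relay.transform.InferType()(tvm.IRModule.from_expr(func))
print(mod)  # expected: packed weight (6, 6, 6, 64, 32), conv3d output (1, 64, 16, 56, 56)

Keeping the weight transform as a separate operator is what lets the precompute pass fold it ahead of time, so the kernel transform does not have to be recomputed at inference.
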
--- python/tvm/relay/op/nn/_nn.py | 23 ++++++ python/tvm/relay/op/nn/nn.py | 97 +++++++++++++++++++++++- python/tvm/relay/op/nn/util.py | 43 +++++++++++ python/tvm/relay/op/strategy/cuda.py | 20 +++++ python/tvm/relay/op/strategy/generic.py | 13 ++++ topi/python/topi/cuda/conv3d_alter_op.py | 89 ++++++++++++++++++++++ topi/python/topi/generic/nn.py | 29 +++++++ topi/python/topi/nn/conv3d.py | 58 +++++++++++++- 8 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 topi/python/topi/cuda/conv3d_alter_op.py diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index aa35fa2e8274..a218d9cf2849 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -178,6 +178,29 @@ def legalize_conv2d_transpose(attrs, inputs, types): reg.register_strategy("nn.conv3d", strategy.conv3d_strategy) reg.register_pattern("nn.conv3d", OpPattern.OUT_ELEMWISE_FUSABLE) +@reg.register_alter_op_layout("nn.conv3d") +def alter_op_layout_conv3d(attrs, inputs, tinfos, out_type): + """Alternate the layout of conv3d""" + return topi.nn.conv3d_alter_layout(attrs, inputs, tinfos, out_type) + +# conv3d_winograd related operators +reg.register_strategy("nn.contrib_conv3d_winograd_without_weight_transform", + strategy.conv3d_winograd_without_weight_transfrom_strategy) +reg.register_pattern("nn.contrib_conv3d_winograd_without_weight_transform", + OpPattern.OUT_ELEMWISE_FUSABLE) + +@reg.register_compute("nn.contrib_conv3d_winograd_weight_transform") +def compute_contrib_conv3d_winograd_weight_transform(attrs, inputs, out_dtype): + """Compute definition of contrib_conv3d_winograd_weight_transform""" + out = topi.nn.conv3d_winograd_weight_transform( + inputs[0], attrs.get_int('tile_size')) + return [out] + +reg.register_schedule("nn.contrib_conv3d_winograd_weight_transform", + strategy.schedule_conv3d_winograd_weight_transform) +reg.register_pattern("nn.contrib_conv3d_winograd_weight_transform", + OpPattern.OUT_ELEMWISE_FUSABLE) + # conv1d_transpose reg.register_strategy("nn.conv1d_transpose", strategy.conv1d_transpose_strategy) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index c62b1cf3e23c..a46539ef4d06 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -19,7 +19,7 @@ from __future__ import absolute_import as _abs from ...expr import TupleWrapper from . import _make -from .util import get_pad_tuple2d +from .util import get_pad_tuple2d, get_pad_tuple3d def conv1d(data, @@ -302,6 +302,78 @@ def conv3d(data, kernel_layout, out_layout, out_dtype) +def contrib_conv3d_winograd_without_weight_transform(data, + weight, + tile_size, + strides=(1, 1, 1), + padding=(0, 0, 0), + dilation=(1, 1, 1), + groups=1, + channels=None, + kernel_size=None, + data_layout="NCDHW", + kernel_layout="OIDHW", + out_layout="", + out_dtype=""): + r"""3D convolution with winograd algorithm. + + The basic parameters are the same as the ones in vanilla conv3d. + It assumes the weight is pre-transformed by nn.contrib_conv3d_winograd_weight_transform + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator. + + weight : tvm.relay.Expr + The weight expressions. + + tile_size : int + The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) + + strides : tuple of int, optional + The strides of convolution. + + padding : tuple of int, optional + The padding of convolution on both sides of inputs before convolution. 
+ + dilation : tuple of int, optional + Specifies the dilation rate to be used for dilated convolution. + + groups : int, optional + Number of groups for grouped convolution. + + channels : int, optional + Number of output channels of this convolution. + + kernel_size : tuple of int, optional + The spatial of the convolution kernel. + + data_layout : str, optional + Layout of the input. + + kernel_layout : str, optional + Layout of the weight. + + out_layout : str, optional + Layout of the output, by default, out_layout is the same as data_layout + + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # convert 3-way padding to 6-way padding + padding = get_pad_tuple3d(padding) + return _make.contrib_conv3d_winograd_without_weight_transform( + data, weight, tile_size, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype) + + def conv2d_transpose(data, weight, strides=(1, 1), @@ -1928,6 +2000,29 @@ def contrib_conv2d_winograd_weight_transform(weight, return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size) +def contrib_conv3d_winograd_weight_transform(weight, + tile_size): + r"""Weight Transformation part for 3D convolution with winograd algorithm. + + We separate this as a single op to enable pre-compute for inference. + Use this together with nn.contrib_conv3d_winograd_without_weight_transform + + Parameters + ---------- + weight : tvm.relay.Expr + The weight expressions. + + tile_size : int + The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.contrib_conv3d_winograd_weight_transform(weight, tile_size) + + def contrib_conv2d_winograd_nnpack_weight_transform(weight, convolution_algorithm, out_dtype=""): diff --git a/python/tvm/relay/op/nn/util.py b/python/tvm/relay/op/nn/util.py index 323ef7f9310e..b2a649db8d69 100644 --- a/python/tvm/relay/op/nn/util.py +++ b/python/tvm/relay/op/nn/util.py @@ -54,3 +54,46 @@ def get_pad_tuple2d(padding): pad_top = (pad_h + 1) // 2 pad_left = (pad_w + 1) // 2 return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left + + +def get_pad_tuple3d(padding): + """Common code to get the pad option + Parameters + ---------- + padding : Union[int, Tuple[int, ...]] + Padding size + Returns + ------- + pad_front : int + Padding size on front + pad_top : int + Padding size on top + pad_left : int + Padding size on left + pad_back : int + Padding size on back + pad_down : int + Padding size on down. + pad_right : int + Padding size on right. 
+ """ + # compute the padding size + if isinstance(padding, container.Array): + padding = list(padding) + if isinstance(padding, (tuple, list)): + if len(padding) == 3: + pad_d = padding[0] * 2 + pad_h = padding[1] * 2 + pad_w = padding[2] * 2 + elif len(padding) == 6: + return padding[0], padding[1], padding[2], padding[3], padding[4], padding[5] + else: + raise ValueError("Size of padding can only be 3 or 6") + elif isinstance(padding, int): + pad_d, pad_h = pad_w = padding * 2 + else: + raise ValueError("Unknown padding option %s" % padding) + pad_front = (pad_d + 1) // 2 + pad_top = (pad_h + 1) // 2 + pad_left = (pad_w + 1) // 2 + return pad_front, pad_top, pad_left, pad_d - pad_fornt, pad_h - pad_top, pad_w - pad_left diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 3e3bc36693e8..2bd4d2672753 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -264,6 +264,26 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): plevel=15) return strategy +@conv3d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"]) +def conv3d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target): + """conv3d_winograd_without_weight_transfrom cuda strategy""" + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.data_layout + assert dilation == (1, 1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + strategy = _op.OpStrategy() + if layout == "NCDHW": + strategy.add_implementation( + wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd_without_weight_transform), + wrap_topi_schedule( + topi.cuda.schedule_conv3d_ncdhw_winograd_without_weight_transform), + name="conv3d_ncdhw_winograd_without_weight_transform.cuda") + else: + raise RuntimeError("Unsupported conv3d_winograd_without_weight_transfrom layout {}". + format(layout)) + return strategy + @conv1d_strategy.register(["cuda", "gpu"]) def conv1d_strategy_cuda(attrs, inputs, out_type, target): """conv1d cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 573df3675eee..388e104dca29 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -374,6 +374,19 @@ def conv3d_strategy(attrs, inputs, out_type, target): raise ValueError("Not support this layout {} yet".format(layout)) return strategy +# conv3d_winograd_without_weight_transform +@override_native_generic_func("conv3d_winograd_without_weight_transform_strategy") +def conv3d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, target): + """conv3d_winograd_without_weight_transfrom generic strategy""" + raise ValueError("No generic implemenation for conv3d_winograd_without_weight_transform") + +# conv3d_winograd_weight_transform +@generic_func +def schedule_conv3d_winograd_weight_transform(attrs, outs, target): + """Schedule conv3d_winograd_weight_transform""" + with target: + return topi.generic.schedule_conv3d_winograd_weight_transform(outs) + # conv1d def wrap_compute_conv1d(topi_compute): """wrap conv1d topi compute""" diff --git a/topi/python/topi/cuda/conv3d_alter_op.py b/topi/python/topi/cuda/conv3d_alter_op.py new file mode 100644 index 000000000000..c65654fbeaa1 --- /dev/null +++ b/topi/python/topi/cuda/conv3d_alter_op.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Conv3D alter op and legalize functions for cuda backend""" + +import logging +import tvm +from tvm import te +from tvm import relay +from tvm import autotvm + +from .. import nn +from ..util import get_const_tuple +from .conv3d_winograd import _infer_tile_size + +logger = logging.getLogger('topi') + +@nn.conv3d_alter_layout.register(["cuda", "gpu"]) +def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv3d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None + + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int('groups') + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + if topi_tmpl == "conv3d_ncdhw_winograd_cuda": + if dilation != (1, 1, 1): + logger.warning("Does not support weight pre-transform for dilated 3D convolution.") + return None + + assert data_layout == "NCDHW" and kernel_layout == "OIDHW" + N, CI, D, H, W = get_const_tuple(data.shape) + CO, _, KD, KH, KW = get_const_tuple(kernel.shape) + + # Pre-compute weight transformation in winograd + tile_size = _infer_tile_size(tinfos[0], tinfos[1]) + + weight = relay.nn.contrib_conv3d_winograd_weight_transform(inputs[1], tile_size=tile_size) + new_attrs['tile_size'] = tile_size + new_attrs['channels'] = CO + + # Store the same config for the altered operators (workload) + new_data = data + new_weight = te.placeholder( + (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CI, CO), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype], + "conv3d_ncdhw_winograd_without_weight_transform.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv3d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs) + + return None diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 43b12822b239..f7e2a97de314 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -187,6 +187,35 @@ def 
schedule_conv2d_winograd_weight_transform(outs): return s +def schedule_conv3d_winograd_weight_transform(outs): + """Schedule for weight transformation of 3D winograd + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of this operator + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + # Typically this is computed in PreCompute pass + # so we make a schedule here for cpu llvm + s = te.create_schedule([x.op for x in outs]) + output = outs[0] + _, G = s[output].op.input_tensors + s[G].compute_inline() + omg, eps, nu, co, ci = s[output].op.axis + r_kd, r_kh, r_kw = s[output].op.reduce_axis + s[output].reorder(co, ci, r_kd, r_kh, r_kw, omg, eps, nu) + for axis in [r_kd, r_kh, r_kw, omg, eps, nu]: + s[output].unroll(axis) + s[output].parallel(co) + return s + + def schedule_conv2d_winograd_without_weight_transform(outs): """Schedule for winograd without weight transformation diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index d6bd6424a947..e66e4fe6edb5 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -21,7 +21,8 @@ from .pad import pad from .util import get_pad_tuple3d -from ..util import simplify +from ..util import simplify, get_const_tuple +from .winograd_util import winograd_transform_matrice def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): @@ -159,3 +160,58 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Filter[rd, rh, rw, rc, cc].astype(out_dtype), axis=[rd, rh, rw, rc]), name="Conv3dOutput", tag="conv3d_ndhwc") return Output + + +def conv3d_winograd_weight_transform(kernel, tile_size): + """Weight transformation for 3D winograd + + Parameters + ---------- + kernel: Tensor + The raw kernel tensor with layout "NCDHW". + tile_size: int + Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) + + Returns + ------- + output : tvm.te.Tensor + 5-D with shape [alpha, alpha, alpha, CO, CI] + """ + CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + assert KD == KH == KW, "Only support NxNxN kernel" + + r = tile_size + KD - 1 + shape = (r, r, r) + [CI, CO] + + _, _, G = winograd_transform_matrices(tile_size, KD, kernel.dtype) + + r_kd = te.reduce_axis((0, KD), name='r_kd') + r_kh = te.reduce_axis((0, KH), name='r_kh') + r_kw = te.reduce_axis((0, KW), name='r_kw') + return te.compute(shape, lambda omg, eps, nu, ci, co: + te.sum(kernel[co][ci][r_kd][r_kh][r_kw] * + G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kd, r_kh, r_kw]), name='transform_weight') + + +@tvm.target.generic_func +def conv3d_alter_layout(attrs, inputs, tinfos, out_type): + """Change Conv3D layout. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. + """ + # not to change by default + return None \ No newline at end of file From 71379ae84c29d77b04439eb3763abdc11e6eea4e Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Mon, 30 Mar 2020 17:23:47 -0700 Subject: [PATCH 12/28] Bug fixes and testing done. 
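Besides the registry-name and padding typo fixes, this adds end-to-end tests for the winograd path. As a quick, purely illustrative sanity check, the shape bookkeeping for the first new test case (a (1, 32, 16, 16, 16) input against a (64, 32, 3, 3, 3) kernel with padding (1, 1, 1)) works out as follows:

    m, r = 4, 3                      # tile_size (16 % 8 == 0, so 4) and kernel size
    alpha = m + r - 1                # 6, edge length of a transformed tile
    N, CI, CO, size = 1, 32, 64, 16
    out = size + 2 * 1 - r + 1       # 16, stride-1 output edge per spatial axis
    tiles = (out + m - 1) // m       # 4 tiles along each of D, H and W
    P = N * tiles ** 3               # 64 tiles feed the batched GEMM
    print((alpha, alpha, alpha, CI, CO))  # packed kernel shape: (6, 6, 6, 32, 64)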
--- python/tvm/relay/op/nn/util.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 6 +- src/relay/op/nn/convolution.cc | 4 +- src/relay/op/nn/convolution.h | 39 +++++---- tests/python/relay/test_op_level2.py | 87 ++++++++++++++++++- topi/python/topi/cuda/conv3d_alter_op.py | 2 +- topi/python/topi/cuda/conv3d_winograd.py | 9 +- topi/python/topi/generic/nn.py | 4 +- topi/python/topi/nn/conv3d.py | 7 +- .../tests/python/test_topi_conv3d_winograd.py | 2 + 10 files changed, 127 insertions(+), 35 deletions(-) diff --git a/python/tvm/relay/op/nn/util.py b/python/tvm/relay/op/nn/util.py index b2a649db8d69..417da97af16d 100644 --- a/python/tvm/relay/op/nn/util.py +++ b/python/tvm/relay/op/nn/util.py @@ -96,4 +96,4 @@ def get_pad_tuple3d(padding): pad_front = (pad_d + 1) // 2 pad_top = (pad_h + 1) // 2 pad_left = (pad_w + 1) // 2 - return pad_front, pad_top, pad_left, pad_d - pad_fornt, pad_h - pad_top, pad_w - pad_left + return pad_front, pad_top, pad_left, pad_d - pad_front, pad_h - pad_top, pad_w - pad_left diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 2bd4d2672753..55dc9250bbb8 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -233,10 +233,10 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): def conv3d_strategy_cuda(attrs, inputs, out_type, target): """conv3d cuda strategy""" strategy = _op.OpStrategy() - data, kernel = inputs + _, kernel = inputs layout = attrs.data_layout stride_d, stride_h, stride_w = attrs.get_int_tuple("strides") - dilation_d, dilation_h, dilation_w = attrs.get_int_tuple("dilation") + dilation_d, dilation_h, dilation_w = attrs.get_int_tuple("dilation") assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) if layout == "NCDHW": strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), @@ -244,7 +244,7 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): name="conv3d_ncdhw.cuda", plevel=10) _, _, kd, kh, kw = get_const_tuple(kernel.shape) - if 2< kd < 8 and 2 < kh < 8 and 2 < kw < 8 and kd == kh and kh == kw and \ + if 2 < kd < 8 and 2 < kh < 8 and 2 < kw < 8 and kd == kh and kh == kw and \ stride_d == 1 and stride_h == 1 and stride_w == 1 and \ dilation_d == 1 and dilation_h == 1 and dilation_w == 1: strategy.add_implementation( diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index c31ebaadb602..019da3279a57 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -498,7 +498,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_without_weight_tr return MakeConvWinograd( data, weight, tile_size, strides, padding, dilation, groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype, "nn.contrib.conv3d_winograd_without_weight_transform"); + kernel_layout, out_layout, out_dtype, "nn.contrib_conv3d_winograd_without_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") @@ -527,7 +527,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform" .set_body_typed([](Expr weight, int tile_size) { return MakeConvWinogradWeightTransform( - weight, tile_size, "nn.contrib.conv3d_winograd_weight_transform"); + weight, tile_size, "nn.contrib_conv3d_winograd_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_weight_transform") diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index b82a3b16f22e..fb41bce6645e 100644 --- 
a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "../op_common.h" @@ -411,8 +412,8 @@ bool Conv3DWinogradWeightTransformRel(const Array& types, param->tile_size + data->shape[2] - 1, param->tile_size + data->shape[3] - 1, param->tile_size + data->shape[4] - 1, - data->shape[0], data->shape[1], + data->shape[0], }; reporter->Assign(types[1], TensorType(Array(oshape), @@ -490,7 +491,7 @@ bool Conv2DWinogradRel(const Array& types, IndexExpr channels, dilated_ksize_y, dilated_ksize_x; CHECK(param->kernel_size.defined() && param->channels.defined()) - << "The kernel size and channels of a Conv must be set or infered by previous pass"; + << "The kernel size and channels of a Conv must be set or inferred by previous pass"; CHECK_EQ(param->kernel_size.size(), 2); CHECK_EQ(param->dilation.size(), 2); @@ -572,7 +573,7 @@ bool Conv3DWinogradRel(const Array& types, IndexExpr channels, dilated_ksize_d, dilated_ksize_y, dilated_ksize_x; CHECK(param->kernel_size.defined() && param->channels.defined()) - << "The kernel size and channels of a Conv must be set or infered by previous pass"; + << "The kernel size and channels of a Conv must be set or inferred by previous pass"; CHECK_EQ(param->kernel_size.size(), 3); CHECK_EQ(param->dilation.size(), 3); @@ -815,22 +816,6 @@ bool Conv2DTransposeRel(const Array& types, } -template -Array > ConvInferCorrectLayout( - const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array &old_in_types) { - const T* params = attrs.as(); - - // We always make other operators to fit the layouts of convolution layers - // So this inference ignores all inputs - return Array >{{params->data_layout, params->kernel_layout}, - {params->out_layout == "" ? - params->data_layout : params->out_layout}}; -} - - // Deformable Convolution shape relations. template bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -912,6 +897,22 @@ bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& } +template +Array > ConvInferCorrectLayout( + const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array &old_in_types) { + const T* params = attrs.as(); + + // We always make other operators to fit the layouts of convolution layers + // So this inference ignores all inputs + return Array >{{params->data_layout, params->kernel_layout}, + {params->out_layout == "" ? 
+ params->data_layout : params->out_layout}}; +} + + } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_NN_CONVOLUTION_H_ diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 7a42fc329e04..f46dec7607f9 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -25,6 +25,7 @@ from tvm.relay.testing import ctx_list, run_infer_type from tvm.contrib import util import topi.testing +from topi.cuda.conv3d_winograd import _infer_tile_size def test_conv1d_infer_type(): @@ -326,7 +327,7 @@ def _query_inside(self, target, workload): cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_rc'] = autotvm.task.space.SplitEntity([-1, 1]) - cfg['auto_unroll_max_setp'] = autotvm.task.space.OtherOptionEntity(1500) + cfg['auto_unroll_max_step'] = autotvm.task.space.OtherOptionEntity(1500) cfg['unroll_explicit'] = autotvm.task.space.OtherOptionEntity(1) self.memory[key] = cfg return cfg @@ -522,6 +523,89 @@ def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, run_test_conv3d("float32", "float32", 1, dshape, kshape, padding=(1, 1, 1), channels=10, kernel_size=(3, 3 ,3), except_targets=["cuda"]) +def test_conv3d_winograd(): + class WinogradFallback(autotvm.FallbackContext): + def _query_inside(self, target, workload): + key = (target, workload) + if key in self.memory: + return self.memory[key] + cfg = autotvm.task.space.FallbackConfigEntity() + cfg.is_fallback = False + cfg.cost = 0.1 if 'winograd' in workload[0] else 1 + cfg['tile_b'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) + cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) + cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) + cfg['tile_rc'] = autotvm.task.space.SplitEntity([-1, 1]) + cfg['auto_unroll_max_step'] = autotvm.task.space.OtherOptionEntity(0) + cfg['unroll_explicit'] = autotvm.task.space.OtherOptionEntity(1) + self.memory[key] = cfg + return cfg + + def run_test_conv3d_cuda(dtype, out_dtype, scale, dshape, kshape, + padding=(1, 1, 1), + groups=1, + dilation=(1, 1, 1), + prepack=False, + **attrs): + + x = relay.var("x", shape=dshape, dtype=dtype) + w = relay.var("w", shape=kshape, dtype=dtype) + if prepack: + tile_size = _infer_tile_size(np.zeros(shape=dshape), np.zeros(shape=kshape)) + w_packed = relay.nn.contrib_conv3d_winograd_weight_transform(w, tile_size) + + y = relay.nn.contrib_conv3d_winograd_without_weight_transform( + x, w_packed, tile_size, + padding=padding, + dilation=dilation, + groups=groups, + channels=kshape[0], + **attrs) + else: + y = relay.nn.conv3d(x, w, + padding=padding, + dilation=dilation, + groups=groups, + **attrs) + func = relay.Function([x, w], y) + mod = tvm.IRModule() + mod['main'] = func + mod = relay.transform.InferType()(mod) + + data = np.random.uniform(-scale, scale, size=dshape).astype(dtype) + kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype) + ref_res = topi.testing.conv3d_ncdhw_python( + data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, + groups=groups) + + with WinogradFallback(), relay.build_config(opt_level=3): + for target, ctx in ctx_list(): + if target != 'cuda': + continue + params = {'w': tvm.nd.array(kernel)} + graph, lib, params = relay.build_module.build(mod, target=target, params=params) + module = tvm.contrib.graph_runtime.create(graph, lib, ctx) + module.set_input('x', tvm.nd.array(data)) + module.set_input(**params) + module.run() + op_res1 = 
module.get_output(0) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-3, atol=1e-3) + + # normal winograd: stride 1, padding 1, kernel 3x3x3 + dshape = (1, 32, 16, 16, 16) + kshape = (64, 32, 3, 3, 3) + run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, + padding=(1, 1, 1), kernel_size=(3, 3, 3)) + run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, + padding=(1, 1, 1), kernel_size=(3, 3, 3), prepack=True) + # extended winograd: stride 1, padding N, kernel NxNxN + dshape = (1, 61, 20, 20, 20) + kshape = (120, 61, 5, 5, 5) + run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, + padding=(2, 2, 2), channels=120, kernel_size=(5, 5, 5)) + run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, + padding=(2, 2, 2), kernel_size=(5, 5, 5), prepack=True) + def test_conv2d_transpose_infer_type(): # symbolic in batch dimension @@ -1268,6 +1352,7 @@ def test_bitpack_infer_type(): test_conv2d_winograd() test_conv3d_run() test_conv3d_ndhwc_run() + test_conv3d_winograd() test_bitserial_conv2d_infer_type() test_batch_flatten() test_upsampling() diff --git a/topi/python/topi/cuda/conv3d_alter_op.py b/topi/python/topi/cuda/conv3d_alter_op.py index c65654fbeaa1..d207b6894616 100644 --- a/topi/python/topi/cuda/conv3d_alter_op.py +++ b/topi/python/topi/cuda/conv3d_alter_op.py @@ -62,7 +62,7 @@ def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): if dilation != (1, 1, 1): logger.warning("Does not support weight pre-transform for dilated 3D convolution.") return None - + assert data_layout == "NCDHW" and kernel_layout == "OIDHW" N, CI, D, H, W = get_const_tuple(data.shape) CO, _, KD, KH, KW = get_const_tuple(kernel.shape) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index ba3d3b85de83..b1ac6ff5b660 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -60,7 +60,8 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ # dilation is not supported alpha, _, _, CI, CO = get_const_tuple(kernel.shape) KD = KH = KW = alpha + 1 - tile_size - assert DSTR == 1 and HSTR == 1 and WSTR == 1 and dilation_d == 1 and dilation_h == 1 and dilation_w == 1 + assert DSTR == 1 and HSTR == 1 and WSTR == 1 and \ + dilation_d == 1 and dilation_h == 1 and dilation_w == 1 pf, pt, pl, pb, pd, pr = nn.get_pad_tuple3d(padding, (KD, KH, KW)) data_pad = nn.pad(data, (0, 0, pf, pt, pl), (0, 0, pb, pd, pr), name="data_pad") @@ -187,7 +188,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): else: s[G].compute_inline() r_a, r_b, r_c = s[kernel_pack].op.reduce_axis - for axis in [omg, eps, nu, r_a, r_b, r_c]: + # Could add additional unrolling by omg, eps, nu in the future. 
+ for axis in [r_a, r_b, r_c]: s[kernel_pack].unroll(axis) fused = s[kernel_pack].fuse(ci, co) @@ -300,7 +302,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): s[A].compute_inline() co, p, vd, vh, vw = s[inverse].op.axis r_a, r_b, r_c = s[inverse].op.reduce_axis - for axis in [vd, vh, vw, r_a, r_b, r_c]: + # Could add additional unrolling of vd, vh, vw, in the future + for axis in [r_a, r_b, r_c]: s[inverse].unroll(axis) s[inverse].compute_at(s[output], tt) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index f7e2a97de314..0d78bac0133a 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -209,8 +209,8 @@ def schedule_conv3d_winograd_weight_transform(outs): s[G].compute_inline() omg, eps, nu, co, ci = s[output].op.axis r_kd, r_kh, r_kw = s[output].op.reduce_axis - s[output].reorder(co, ci, r_kd, r_kh, r_kw, omg, eps, nu) - for axis in [r_kd, r_kh, r_kw, omg, eps, nu]: + s[output].reorder(co, ci, omg, eps, nu, r_kd, r_kh, r_kw) + for axis in [r_kd, r_kh, r_kw]: s[output].unroll(axis) s[output].parallel(co) return s diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index e66e4fe6edb5..409db3ee2449 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -17,12 +17,13 @@ # pylint: disable=invalid-name, unused-variable, too-many-locals # pylint: disable=unused-argument, redefined-builtin, no-else-return """Conv3D operators""" +import tvm from tvm import te from .pad import pad from .util import get_pad_tuple3d from ..util import simplify, get_const_tuple -from .winograd_util import winograd_transform_matrice +from .winograd_util import winograd_transform_matrices def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): @@ -181,7 +182,7 @@ def conv3d_winograd_weight_transform(kernel, tile_size): assert KD == KH == KW, "Only support NxNxN kernel" r = tile_size + KD - 1 - shape = (r, r, r) + [CI, CO] + shape = (r, r, r) + (CI, CO) _, _, G = winograd_transform_matrices(tile_size, KD, kernel.dtype) @@ -214,4 +215,4 @@ def conv3d_alter_layout(attrs, inputs, tinfos, out_type): Unlike other TOPI functions, this function operates on both graph level and operator level. """ # not to change by default - return None \ No newline at end of file + return None diff --git a/topi/tests/python/test_topi_conv3d_winograd.py b/topi/tests/python/test_topi_conv3d_winograd.py index 28722fe749fc..89d8e269c2af 100644 --- a/topi/tests/python/test_topi_conv3d_winograd.py +++ b/topi/tests/python/test_topi_conv3d_winograd.py @@ -119,6 +119,8 @@ def check_device(device): def test_conv3d_ncdhw(): #3DCNN workloads verify_conv3d_ncdhw(1, 61, 20, 120, 3, 1, 0) + verify_conv3d_ncdhw(1, 61, 20, 120, 5, 1, 2) + verify_conv3d_ncdhw(1, 61, 20, 120, 7, 1, 3) verify_conv3d_ncdhw(1, 128, 12, 256, 3, 1, 1) verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1) From 8f13b7678d84ae6e58f67de18793b6688506f708 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 31 Mar 2020 16:24:14 -0700 Subject: [PATCH 13/28] Fix a few autotvm bugs. 
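Two of the fixes below concern 3-D padding: conv3d now always normalizes its padding through get_pad_tuple3d, and the helper's integer case is corrected to pad_d = pad_h = pad_w. For reference, a simplified, self-contained version of the helper as it reads after this patch, with a couple of illustrative checks that are not part of the patch:

    def get_pad_tuple3d(padding):
        # Simplified mirror of python/tvm/relay/op/nn/util.py (tvm container
        # handling omitted). Returns (front, top, left, back, down, right).
        if isinstance(padding, (tuple, list)):
            if len(padding) == 3:
                pad_d, pad_h, pad_w = padding[0] * 2, padding[1] * 2, padding[2] * 2
            elif len(padding) == 6:
                return tuple(padding)
            else:
                raise ValueError("Size of padding can only be 3 or 6")
        elif isinstance(padding, int):
            pad_d = pad_h = pad_w = padding * 2
        else:
            raise ValueError("Unknown padding option %s" % padding)
        pad_front = (pad_d + 1) // 2
        pad_top = (pad_h + 1) // 2
        pad_left = (pad_w + 1) // 2
        return (pad_front, pad_top, pad_left,
                pad_d - pad_front, pad_h - pad_top, pad_w - pad_left)

    assert get_pad_tuple3d(1) == (1, 1, 1, 1, 1, 1)
    assert get_pad_tuple3d((0, 1, 2)) == (0, 1, 2, 0, 1, 2)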
--- python/tvm/relay/op/nn/nn.py | 3 +-- python/tvm/relay/op/nn/util.py | 2 +- topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/conv3d_alter_op.py | 2 +- topi/python/topi/x86/conv2d_alter_op.py | 1 + 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index a46539ef4d06..eb0015921ef1 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -295,8 +295,7 @@ def conv3d(data, strides = (strides, strides, strides) if isinstance(dilation, int): dilation = (dilation, dilation, dilation) - if isinstance(padding, int): - padding = (padding, padding, padding) + padding = get_pad_tuple3d(padding) return _make.conv3d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) diff --git a/python/tvm/relay/op/nn/util.py b/python/tvm/relay/op/nn/util.py index 417da97af16d..1fdcad73c74e 100644 --- a/python/tvm/relay/op/nn/util.py +++ b/python/tvm/relay/op/nn/util.py @@ -90,7 +90,7 @@ def get_pad_tuple3d(padding): else: raise ValueError("Size of padding can only be 3 or 6") elif isinstance(padding, int): - pad_d, pad_h = pad_w = padding * 2 + pad_d = pad_h = pad_w = padding * 2 else: raise ValueError("Unknown padding option %s" % padding) pad_front = (pad_d + 1) // 2 diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index ad385a78fb28..83ddedc996fe 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -32,6 +32,7 @@ from .deformable_conv2d import * from .conv3d import * from .conv3d_winograd import * +from . import conv3d_alter_op from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast diff --git a/topi/python/topi/cuda/conv3d_alter_op.py b/topi/python/topi/cuda/conv3d_alter_op.py index d207b6894616..395612dd7e31 100644 --- a/topi/python/topi/cuda/conv3d_alter_op.py +++ b/topi/python/topi/cuda/conv3d_alter_op.py @@ -58,7 +58,7 @@ def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - if topi_tmpl == "conv3d_ncdhw_winograd_cuda": + if topi_tmpl == "conv3d_ncdhw_winograd.cuda": if dilation != (1, 1, 1): logger.warning("Does not support weight pre-transform for dilated 3D convolution.") return None diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 5ee691b07362..36d32795d84b 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -33,6 +33,7 @@ @conv2d_alter_layout.register("cpu") def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + print("HERE") target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): From e9c2da4d2d963b034f9e38190e54d6aa6920d5b2 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 31 Mar 2020 16:26:04 -0700 Subject: [PATCH 14/28] Drop silly debug print. 
--- topi/python/topi/x86/conv2d_alter_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 36d32795d84b..5ee691b07362 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -33,7 +33,6 @@ @conv2d_alter_layout.register("cpu") def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - print("HERE") target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): From 69bf95ed54f2394891d55fa00d9023e4f6351188 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 1 Apr 2020 16:19:39 -0700 Subject: [PATCH 15/28] Removed debug_skip_region. --- topi/python/topi/cuda/conv3d_winograd.py | 55 ++++++++++++------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index b1ac6ff5b660..9bd270b092db 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -78,15 +78,22 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ # transform kernel if not pre_computed: - r_kd = te.reduce_axis((0, KD), name='r_kd') - r_kh = te.reduce_axis((0, KH), name='r_kh') - r_kw = te.reduce_axis((0, KW), name='r_kw') - kernel_pack = te.compute( - (alpha, alpha, alpha, CI, CO), - lambda omg, eps, nu, ci, co: te.sum( - kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], - axis=[r_kd, r_kh, r_kw]), - name='kernel_pack') + # Check if we are currently tuning, if so we want to avoid counting + # prepacking in time costs. Just use a placeholder with the packed shape instead. + if autotvm.GLOBAL_SCOPE.in_tuning: + kernel_pack = te.placeholder((alpha, alpha, alpha, CI, CO), + dtype=kernel.dtype, + name='kernel_pack') + else: + r_kd = te.reduce_axis((0, KD), name='r_kd') + r_kh = te.reduce_axis((0, KH), name='r_kh') + r_kw = te.reduce_axis((0, KW), name='r_kw') + kernel_pack = te.compute( + (alpha, alpha, alpha, CI, CO), + lambda omg, eps, nu, ci, co: te.sum( + kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kd, r_kh, r_kw]), + name='kernel_pack') else: kernel_pack = kernel @@ -177,26 +184,20 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): s[pad_data].compute_inline() # transform kernel - if not pre_computed: + if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning: kernel, G = s[kernel_pack].op.input_tensors omg, eps, nu, ci, co = s[kernel_pack].op.axis - if autotvm.GLOBAL_SCOPE.in_tuning: - # skip this part during tuning to make recrods accurate - # this part will be pre-computed during pre-compute optimization pass - s[G].pragma(s[G].op.axis[0], 'debug_skip_region') - s[kernel_pack].pragma(eps, 'debug_skip_region') - else: - s[G].compute_inline() - r_a, r_b, r_c = s[kernel_pack].op.reduce_axis - # Could add additional unrolling by omg, eps, nu in the future. - for axis in [r_a, r_b, r_c]: - s[kernel_pack].unroll(axis) - - fused = s[kernel_pack].fuse(ci, co) - bb, tt = s[kernel_pack].split(fused, 128) - s[kernel_pack].reorder(bb, tt, omg, eps, nu, r_a, r_b, r_c) - s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) - s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x")) + s[G].compute_inline() + r_a, r_b, r_c = s[kernel_pack].op.reduce_axis + # Could add additional unrolling by omg, eps, nu in the future. 
+ for axis in [r_a, r_b, r_c]: + s[kernel_pack].unroll(axis) + + fused = s[kernel_pack].fuse(ci, co) + bb, tt = s[kernel_pack].split(fused, 128) + s[kernel_pack].reorder(bb, tt, omg, eps, nu, r_a, r_b, r_c) + s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x")) else: kernel = kernel_pack From b51e08331ff875c5aa380aff3e6fad4f2210a5ca Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 31 Mar 2020 18:49:09 -0700 Subject: [PATCH 16/28] Add variant of conv3d_winograd that doesn't transform depth. --- topi/python/topi/cuda/conv3d_winograd.py | 103 ++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 9bd270b092db..fe424683d248 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -23,7 +23,7 @@ from tvm import autotvm from .. import nn -from ..util import get_const_int, get_const_tuple, traverse_inline +from ..util import get_const_int, get_const_tuple, traverse_inline, simplify from ..nn.winograd_util import winograd_transform_matrices logger = logging.getLogger('conv3d_winograd') @@ -32,7 +32,7 @@ def _infer_tile_size(data, kernel): N, CI, D, H, W = get_const_tuple(data.shape) - if D % 8 == 0: + if H % 8 == 0: return 4 return 2 @@ -151,6 +151,99 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ return output +def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed): + """Compute declaration for winograd without transforming depth""" + tile_size = _infer_tile_size(data, kernel) + + N, CI, D, H, W = get_const_tuple(data.shape) + + if isinstance(dilation, int): + dilation_d = dilation_h = dilation_w = dilation + else: + dilation_d, dilation_h, dilation_w = dilation + DSTR, HSTR, WSTR = (strides, strides, strides) if isinstance(strides, int) else strides + + if not pre_computed: # kernel tensor is raw tensor, do strict check + if dilation_d != 1 or dilation_h != 1 or dilation_w != 1: + kernel = nn.dilate(kernel, (1, 1, dilation_d, dilation_h, dilation_w)) + CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + alpha = KW + tile_size - 1 + assert HSTR == 1 and WSTR == 1 and KH == KW + else: + # kernel tensor is pre-transfomred. this op is created by alter op layout. 
+ # dilation is not supported + alpha, _, KD, CI, CO = get_const_tuple(kernel.shape) + KH = KW = alpha + 1 - tile_size + assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1 + + pf, pt, pl, pb, pd, pr = nn.get_pad_tuple3d(padding, (KD, KH, KW)) + data_pad = nn.pad(data, (0, 0, pf, pt, pl), (0, 0, pb, pd, pr), name="data_pad") + out_depth = simplify((D - KD + pf + pb ) // DSTR + 1) + D += pf + pb + + r = KW + m = tile_size + A, B, G = winograd_transform_matrices(m, r, out_dtype) + + H = (H + pt + pd - KH) // HSTR + 1 + W = (W + pl + pr - KW) // WSTR + 1 + nH, nW = (H + m-1) // m, (W + m-1) // m + P = N * nH * nW + + # transform kernel + if not pre_computed: + r_kh = te.reduce_axis((0, KH), name='r_kh') + r_kw = te.reduce_axis((0, KW), name='r_kw') + kernel_pack = te.compute((alpha, alpha, KD, CI, CO), lambda eps, nu, d, ci, co: + te.sum(kernel[co][ci][d][r_kh][r_kw] * + G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), name='kernel_pack') + else: + kernel_pack = kernel + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + # pack input tile + input_tile = te.compute((CI, D, P, alpha, alpha), lambda c, d, p, eps, nu: + data_pad[idxdiv(p, (nH * nW))][c][d][idxmod(idxdiv(p, nW), nH) * m + eps] + [idxmod(p, nW) * m + nu], name='d') + + # transform data + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_a') + data_pack = te.compute((alpha, alpha, CI, D, P), lambda eps, nu, ci, d, p: + te.sum(input_tile[ci][d][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], + axis=[r_a, r_b]), name='data_pack') + + # do batch gemm + ci = te.reduce_axis((0, CI), name='ci') + rz = te.reduce_axis((0, KD), name='rz') + bgemm = te.compute((alpha, alpha, CO, out_depth, P), lambda eps, nu, co, d, p: + te.sum(kernel_pack[eps][nu][rz][ci][co] * + data_pack[eps][nu][ci][d * DSTR + rz][p], + axis=[ci, rz]), name='bgemm') + + # inverse transform + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_a') + inverse = te.compute((CO, out_depth, P, m, m), lambda co, d, p, vh, vw: + te.sum(bgemm[r_a][r_b][co][d][p] * A[r_a][vh] * A[r_b][vw], + axis=[r_a, r_b]), name='inverse') + + # output + output = te.compute((N, CO, out_depth, H, W), lambda n, co, d, h, w: + inverse[co, + d, + n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), + idxmod(h, m), + idxmod(w, m)], + name='output', tag='conv2d_nchw_winograd') + cfg.add_flop(2 * N * CO * H * W * CI * KH * KW) + + return output + + def schedule_winograd_cuda(cfg, s, output, pre_computed): """Schedule winograd template""" # get stages @@ -317,6 +410,12 @@ def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dty cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) +@autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_depth.cuda") +def conv3d_ncdhw_winograd_without_depth(cfg, data, kernel, strides, padding, dilation, out_dtype): + return winograd_without_depth_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) + + @autotvm.register_topi_schedule("conv3d_ncdhw_winograd.cuda") def schedule_conv3d_ncdhw_winograd(cfg, outs): s = te.create_schedule([x.op for x in outs]) From 1adfbe0f8da61898bb0bbab6ab539927a47d74bd Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 2 Apr 2020 17:53:41 -0700 Subject: [PATCH 17/28] initial infrastructure done for depthless conv. 
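The new "without depth" variant applies the 2-D F(m x m, r x r) transform over H and W only and keeps D as an ordinary reduction folded into the batched GEMM, which is what lets kernels such as 1x3x3 or 5x3x3 take the winograd path at all. The packed-weight shape therefore depends on whether depth is transformed. A rough sketch of the rule implemented below in both Conv3DWinogradWeightTransformRel and conv3d_winograd_weight_transform (the guard is tightened to also require KD == KH two patches later):

    def packed_weight_shape(KD, KH, KW, CI, CO, tile_size):
        # Illustrative only, not part of the patch.
        assert KH == KW, "spatial kernel must be square"
        if 2 < KD < 8:                 # fold D into the winograd domain too
            r = tile_size + KD - 1
            return (r, r, r, CI, CO)
        r = tile_size + KH - 1         # transform H and W only, keep D as-is
        return (r, r, KD, CI, CO)

    print(packed_weight_shape(3, 3, 3, 32, 64, 4))  # (6, 6, 6, 32, 64)
    print(packed_weight_shape(1, 3, 3, 32, 64, 4))  # (6, 6, 1, 32, 64)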
--- src/relay/op/nn/convolution.h | 25 +- topi/python/topi/cuda/conv3d_winograd.py | 219 ++++++++++++++++-- topi/python/topi/nn/conv3d.py | 34 ++- .../tests/python/test_topi_conv3d_winograd.py | 35 +-- 4 files changed, 253 insertions(+), 60 deletions(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index fb41bce6645e..bdea4d99a17a 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -379,7 +379,6 @@ bool Conv2DWinogradWeightTransformRel(const Array& types, CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; - // each pad width element should be a pair of positive integers std::vector oshape { param->tile_size + data->shape[2] - 1, param->tile_size + data->shape[3] - 1, @@ -407,17 +406,21 @@ bool Conv3DWinogradWeightTransformRel(const Array& types, CHECK_EQ(data->shape.size(), 5) << "Only support NCDHW normal kernel layout"; - // each pad width element should be a pair of positive integers - std::vector oshape { - param->tile_size + data->shape[2] - 1, - param->tile_size + data->shape[3] - 1, - param->tile_size + data->shape[4] - 1, - data->shape[1], - data->shape[0], - }; + // Shape of packed weights depends on whether depth is being transformed or not. + Array oshape({0, 0, 0, data->shape[1], data->shape[0]}); + auto* depth_imm = data->shape[2].as(); + bool transform_depth = (depth_imm->value > 2) and (depth_imm->value < 8); + if (transform_depth) { + oshape.Set(0, param->tile_size + data->shape[2] - 1); + oshape.Set(1, param->tile_size + data->shape[3] - 1); + oshape.Set(2, param->tile_size + data->shape[4] - 1); + } else { + oshape.Set(0, param->tile_size + data->shape[3] - 1); + oshape.Set(1, param->tile_size + data->shape[4] - 1); + oshape.Set(2, data->shape[2]); + } - reporter->Assign(types[1], TensorType(Array(oshape), - data->dtype)); + reporter->Assign(types[1], TensorType(oshape, data->dtype)); return true; } diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index fe424683d248..150ed1e6d6d4 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -193,12 +193,19 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o # transform kernel if not pre_computed: - r_kh = te.reduce_axis((0, KH), name='r_kh') - r_kw = te.reduce_axis((0, KW), name='r_kw') - kernel_pack = te.compute((alpha, alpha, KD, CI, CO), lambda eps, nu, d, ci, co: - te.sum(kernel[co][ci][d][r_kh][r_kw] * - G[eps][r_kh] * G[nu][r_kw], - axis=[r_kh, r_kw]), name='kernel_pack') + # During autotuning dont count kernel packing as a time cost + # as it will later be removed via alter_op_layout. 
+ if autotvm.GLOBAL_SCOPE.in_tuning: + kernel_pack = te.placeholder((alpha, alpha, KD, CI, CO), + dtype=kernel.dtype, + name='kernel_pack') + else: + r_kh = te.reduce_axis((0, KH), name='r_kh') + r_kw = te.reduce_axis((0, KW), name='r_kw') + kernel_pack = te.compute((alpha, alpha, KD, CI, CO), lambda eps, nu, d, ci, co: + te.sum(kernel[co][ci][d][r_kh][r_kw] * + G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), name='kernel_pack') else: kernel_pack = kernel @@ -211,7 +218,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o # transform data r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_b') data_pack = te.compute((alpha, alpha, CI, D, P), lambda eps, nu, ci, d, p: te.sum(input_tile[ci][d][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='data_pack') @@ -226,7 +233,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o # inverse transform r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_b') inverse = te.compute((CO, out_depth, P, m, m), lambda co, d, p, vh, vw: te.sum(bgemm[r_a][r_b][co][d][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='inverse') @@ -238,7 +245,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), idxmod(h, m), idxmod(w, m)], - name='output', tag='conv2d_nchw_winograd') + name='output', tag='conv3d_ncdhw_winograd_without_depth') cfg.add_flop(2 * N * CO * H * W * CI * KH * KW) return output @@ -379,8 +386,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): m = alpha - 3 + 1 n, co, d, h, w = s[output].op.axis do, di = s[output].split(d, m) - ho, hi = s[output].split(w, m) - wo, wi = s[output].split(h, m) + ho, hi = s[output].split(h, m) + wo, wi = s[output].split(w, m) s[output].reorder(n, co, do, ho, wo, di, hi, wi) inverse_scope, n = s[output].split(n, nparts=1) @@ -404,16 +411,172 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): return s -@autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda") -def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): - return winograd_cuda( - cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) +def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed): + """Schedule winograd template""" + # get stages + inverse = s[output].op.input_tensors[0] + bgemm, A = s[inverse].op.input_tensors + kernel_pack, data_pack = s[bgemm].op.input_tensors + input_tile, B = s[data_pack].op.input_tensors + pad_data = s[input_tile].op.input_tensors[0] + + # data transform + s[B].compute_inline() + + data_l = s.cache_write(data_pack, 'local') + eps, nu, c, d, p = s[data_l].op.axis + r_a, r_b = s[data_l].op.reduce_axis + for axis in [eps, nu, r_a, r_b]: + s[data_l].unroll(axis) + + eps, nu, c, d, p = s[data_pack].op.axis + p, pi = s[data_pack].split(p, 1) + fused = s[data_pack].fuse(c, d, p) + bb, tt = s[data_pack].split(fused, 128) + s[data_pack].reorder(bb, tt, pi, eps, nu) + s[data_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[data_pack].bind(tt, te.thread_axis("threadIdx.x")) + + s[data_l].compute_at(s[data_pack], pi) + s[input_tile].compute_at(s[data_pack], pi) + s[pad_data].compute_inline() + + # transform kernel + if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning: + kernel, G = s[kernel_pack].op.input_tensors + eps, nu, kd, ci, co = 
s[kernel_pack].op.axis + s[G].compute_inline() + r_a, r_b = s[kernel_pack].op.reduce_axis + for axis in [eps, nu, r_a, r_b]: + s[kernel_pack].unroll(axis) + + fused = s[kernel_pack].fuse(kd, ci, co) + bb, tt = s[kernel_pack].split(fused, 128) + s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b) + s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x")) + else: + kernel = kernel_pack + + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + + ##### space definition begin ##### + b1, b2, y1, y2, x = s[bgemm].op.axis + # Combine channel and depth axes. + y = s[bgemm].fuse(y1, y2) + rc = s[bgemm].op.reduce_axis[0] + alpha = get_const_int(b1.dom.extent) + + cfg.define_split("tile_b", cfg.axis(alpha * alpha), num_outputs=4, + filter=lambda x: x.size[-3:] == [1, 1, 1]) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 128, 1500]) + target = tvm.target.Target.current() + if target.target_name in ['nvptx', 'rocm']: + cfg.define_knob("unroll_explicit", [1]) + else: + cfg.define_knob("unroll_explicit", [0, 1]) + ##### space definition end ##### + + # batch gemm + C = bgemm + A0, B0 = kernel_pack, data_pack + + OL = s.cache_write(C, 'local') + AA = s.cache_read(A0, 'shared', [OL]) + BB = s.cache_read(B0, 'shared', [OL]) + + b = s[bgemm].fuse(b1, b2) + # tile and bind spatial axes + bgemm_scope, b = s[bgemm].split(b, nparts=1) + bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b) + by, vy, ty, yi = cfg["tile_y"].apply(s, C, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x) + s[C].bind(bz, te.thread_axis("blockIdx.z")) + s[C].bind(by, te.thread_axis("blockIdx.y")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(vz, te.thread_axis("vthread")) + s[C].bind(vy, te.thread_axis("vthread")) + s[C].bind(vx, te.thread_axis("vthread")) + s[C].bind(tz, te.thread_axis("threadIdx.z")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) + s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi) + + # tile reduction axes + s[OL].compute_at(s[C], tx) + b1, b2, y1, y2, x = s[OL].op.axis + y = s[OL].fuse(y1, y2) + b = s[OL].fuse(b1, b2) + rc, = s[OL].op.reduce_axis + rco, rci = cfg['tile_rc'].apply(s, OL, rc) + s[OL].reorder(rco, rci, b, y, x) -@autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_depth.cuda") -def conv3d_ncdhw_winograd_without_depth(cfg, data, kernel, strides, padding, dilation, out_dtype): - return winograd_without_depth_cuda( - cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) + s[AA].compute_at(s[OL], rco) + s[BB].compute_at(s[OL], rco) + + # cooperative fetching + for load in [AA, BB]: + fused = s[load].fuse(*list(s[load].op.axis)) + fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) + fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) + fused, tz = s[load].split(fused, cfg["tile_b"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + + s[C].pragma(bgemm_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[C].pragma(bgemm_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + # schedule inverse, output and fusion + if output.op in s.outputs: + OL = None + else: + OL = output + s[OL].set_scope('local') + output = s.outputs[0] + + 
m = alpha - 3 + 1 + n, co, d, h, w = s[output].op.axis + do, di = s[output].split(d, m) + ho, hi = s[output].split(h, m) + wo, wi = s[output].split(w, m) + s[output].reorder(n, co, do, ho, wo, di, hi, wi) + inverse_scope, n = s[output].split(n, nparts=1) + + fused = s[output].fuse(n, co, do, ho, wo) + bb, tt = s[output].split(fused, 128) + + s[output].bind(bb, te.thread_axis("blockIdx.x")) + s[output].bind(tt, te.thread_axis("threadIdx.x")) + + if OL is not None: + s[OL].compute_at(s[output], tt) + + s[A].compute_inline() + co, d, p, vh, vw = s[inverse].op.axis + r_a, r_b = s[inverse].op.reduce_axis + for axis in [vh, vw, r_a, r_b]: + s[inverse].unroll(axis) + s[inverse].compute_at(s[output], tt) + + return s + + +@autotvm.register_topi_compute("conv3d_ncdhw_winograd.cuda") +def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + # Check if we can transform depth. + if 2 < KD < 8 and KD == KH: + return winograd_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) + else: + return winograd_without_depth_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) @autotvm.register_topi_schedule("conv3d_ncdhw_winograd.cuda") @@ -421,7 +584,9 @@ def schedule_conv3d_ncdhw_winograd(cfg, outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - if 'conv3d_ncdhw_winograd' in op.tag: + if 'conv3d_ncdhw_winograd_without_depth' in op.tag: + schedule_winograd_no_depth_cuda(cfg, s, op.output(0), pre_computed=False) + elif 'conv3d_ncdhw_winograd' in op.tag: schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) traverse_inline(s, outs[0].op, _callback) @@ -431,8 +596,14 @@ def _callback(op): @autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda") def conv3d_ncdhw_winograd_without_weight_transform(cfg, data, kernel, strides, padding, dilation, out_dtype): - return winograd_cuda( - cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) + CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + # Check if we can transform depth. 
+ if 2 < KD < 8 and KD == KH: + return winograd_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) + else: + return winograd_without_depth_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) @autotvm.register_topi_schedule("conv3d_ncdhw_winograd_without_weight_transform.cuda") @@ -441,7 +612,9 @@ def schedule_conv3d_ncdhw_winograd_without_weight_transform(cfg, outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - if 'conv3d_ncdhw_winograd' in op.tag: + if 'conv3d_ncdhw_winograd_without_depth' in op.tag: + schedule_winograd_no_depth_cuda(cfg, s, op.output(0), pre_computed=True) + elif 'conv3d_ncdhw_winograd' in op.tag: schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True) traverse_inline(s, outs[0].op, _callback) diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index 409db3ee2449..388058a14a35 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -179,20 +179,36 @@ def conv3d_winograd_weight_transform(kernel, tile_size): 5-D with shape [alpha, alpha, alpha, CO, CI] """ CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) - assert KD == KH == KW, "Only support NxNxN kernel" - r = tile_size + KD - 1 - shape = (r, r, r) + (CI, CO) + depth_transform = 2 < KD < 8 - _, _, G = winograd_transform_matrices(tile_size, KD, kernel.dtype) + if depth_transform: + assert KD == KH == KW, "Only support NxNxN kernel" + else: + assert KH == KW, "Only supports DxNxN kernel" + + r = tile_size + KH - 1 - r_kd = te.reduce_axis((0, KD), name='r_kd') r_kh = te.reduce_axis((0, KH), name='r_kh') r_kw = te.reduce_axis((0, KW), name='r_kw') - return te.compute(shape, lambda omg, eps, nu, ci, co: - te.sum(kernel[co][ci][r_kd][r_kh][r_kw] * - G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], - axis=[r_kd, r_kh, r_kw]), name='transform_weight') + _, _, G = winograd_transform_matrices(tile_size, KD, kernel.dtype) + if depth_transform: + shape = (r, r, r, CI, CO) + r_kd = te.reduce_axis((0, KD), name='r_kd') + return te.compute( + shape, + lambda omg, eps, nu, ci, co: te.sum( + kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kd, r_kh, r_kw]), + name='transform_weight') + else: + shape = (r, r, KD, CI, CO) + return te.compute( + shape, + lambda eps, nu, d, ci, co: te.sum( + kernel[co][ci][d][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), + name='transform_weight') + @tvm.target.generic_func diff --git a/topi/tests/python/test_topi_conv3d_winograd.py b/topi/tests/python/test_topi_conv3d_winograd.py index 89d8e269c2af..ed66a7871136 100644 --- a/topi/tests/python/test_topi_conv3d_winograd.py +++ b/topi/tests/python/test_topi_conv3d_winograd.py @@ -37,22 +37,23 @@ def verify_conv3d_ncdhw(batch, in_channel, in_size, num_filter, - kernel, + depth_kernel, + space_kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d( - padding, (kernel, kernel, kernel)) + padding, (depth_kernel, space_kernel, space_kernel)) padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % - (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + (batch, in_channel, in_size, num_filter, space_kernel, stride, padding_sum, dilation)) in_depth = in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A') - W = 
te.placeholder((num_filter, in_channel, kernel, kernel, kernel), name='W') + W = te.placeholder((num_filter, in_channel, depth_kernel, space_kernel, space_kernel), name='W') bias = te.placeholder((num_filter, 1, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) @@ -100,14 +101,14 @@ def check_device(device): s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % - (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + (batch, in_channel, in_size, num_filter, space_kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % - (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) + (batch, in_channel, in_size, num_filter, space_kernel, stride, padding_sum, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) @@ -118,25 +119,25 @@ def check_device(device): def test_conv3d_ncdhw(): #3DCNN workloads - verify_conv3d_ncdhw(1, 61, 20, 120, 3, 1, 0) - verify_conv3d_ncdhw(1, 61, 20, 120, 5, 1, 2) - verify_conv3d_ncdhw(1, 61, 20, 120, 7, 1, 3) - verify_conv3d_ncdhw(1, 128, 12, 256, 3, 1, 1) - verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1) + verify_conv3d_ncdhw(1, 61, 20, 120, 3, 3, 1, 0) + verify_conv3d_ncdhw(1, 61, 20, 120, 5, 5, 1, 2) + verify_conv3d_ncdhw(1, 61, 20, 120, 7, 7, 1, 3) + verify_conv3d_ncdhw(1, 128, 12, 256, 3, 3, 1, 1) + verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1) # bias, relu - verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, add_relu=True) - verify_conv3d_ncdhw(1, 64, 12, 128, 3, 1, 1, add_relu=True, add_bias=True) + verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1, add_relu=True) + verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1, add_relu=True, add_bias=True) # dilation = 2 - verify_conv3d_ncdhw(1, 16, 12, 16, 3, 1, "VALID", dilation=2) + verify_conv3d_ncdhw(1, 16, 12, 16, 3, 3, 1, "VALID", dilation=2) # batch size - verify_conv3d_ncdhw(4, 32, 12, 64, 3, 1, 1) + verify_conv3d_ncdhw(4, 32, 12, 64, 3, 3, 1, 1) # weird workloads - verify_conv3d_ncdhw(2, 2, 2, 2, 3, 1, 2) - verify_conv3d_ncdhw(3, 3, 3, 3, 3, 1, 3) + verify_conv3d_ncdhw(2, 2, 2, 2, 3, 3, 1, 2) + verify_conv3d_ncdhw(3, 3, 3, 3, 3, 3, 1, 3) if __name__ == "__main__": From a05fb583ed8c223d1fd0022ce8a37bbfa6ca8b57 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 2 Apr 2020 18:39:26 -0700 Subject: [PATCH 18/28] Fix no_depth schedule bugs. --- topi/python/topi/cuda/conv3d_winograd.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 150ed1e6d6d4..91ab356bfc89 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -462,10 +462,10 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed): s[kernel].compute_inline() ##### space definition begin ##### - b1, b2, y1, y2, x = s[bgemm].op.axis + b1, b2, z, y, x = s[bgemm].op.axis # Combine channel and depth axes. 
- y = s[bgemm].fuse(y1, y2) rc = s[bgemm].op.reduce_axis[0] + rz = s[bgemm].op.reduce_axis[1] alpha = get_const_int(b1.dom.extent) cfg.define_split("tile_b", cfg.axis(alpha * alpha), num_outputs=4, @@ -473,6 +473,7 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed): cfg.define_split("tile_y", y, num_outputs=4) cfg.define_split("tile_x", x, num_outputs=4) cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_split("tile_rz", rz, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 128, 1500]) target = tvm.target.Target.current() if target.target_name in ['nvptx', 'rocm']: @@ -490,6 +491,7 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed): BB = s.cache_read(B0, 'shared', [OL]) b = s[bgemm].fuse(b1, b2) + y = s[bgemm].fuse(z, y) # tile and bind spatial axes bgemm_scope, b = s[bgemm].split(b, nparts=1) @@ -512,9 +514,10 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed): b1, b2, y1, y2, x = s[OL].op.axis y = s[OL].fuse(y1, y2) b = s[OL].fuse(b1, b2) - rc, = s[OL].op.reduce_axis + rc, rz = s[OL].op.reduce_axis rco, rci = cfg['tile_rc'].apply(s, OL, rc) - s[OL].reorder(rco, rci, b, y, x) + rzo, rzi = cfg['tile_rz'].apply(s, OL, rz) + s[OL].reorder(rco, rzo, rci, rzi, b, y, x) s[AA].compute_at(s[OL], rco) s[BB].compute_at(s[OL], rco) From 0a34c18aa8e588dea1202a6806f23df38b6681a1 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 13:23:37 -0700 Subject: [PATCH 19/28] automatic topi switching between depth and depthless winograd. --- python/tvm/relay/op/strategy/cuda.py | 6 +++--- topi/python/topi/cuda/conv3d_alter_op.py | 12 +++++++++--- topi/python/topi/generic/nn.py | 18 +++++++++++++----- topi/python/topi/nn/conv3d.py | 4 ++-- topi/tests/python/test_topi_conv3d_winograd.py | 7 +++++++ 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 55dc9250bbb8..cf412194a0d6 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -244,9 +244,9 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): name="conv3d_ncdhw.cuda", plevel=10) _, _, kd, kh, kw = get_const_tuple(kernel.shape) - if 2 < kd < 8 and 2 < kh < 8 and 2 < kw < 8 and kd == kh and kh == kw and \ - stride_d == 1 and stride_h == 1 and stride_w == 1 and \ - dilation_d == 1 and dilation_h == 1 and dilation_w == 1: + if 2 < kh < 8 and 2 < kw < 8 and kh == kw and \ + stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd), diff --git a/topi/python/topi/cuda/conv3d_alter_op.py b/topi/python/topi/cuda/conv3d_alter_op.py index 395612dd7e31..57fa9b6c3a04 100644 --- a/topi/python/topi/cuda/conv3d_alter_op.py +++ b/topi/python/topi/cuda/conv3d_alter_op.py @@ -76,9 +76,15 @@ def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): # Store the same config for the altered operators (workload) new_data = data - new_weight = te.placeholder( - (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CI, CO), - dtype=kernel.dtype) + # Check if depth is transformed or not + if 2 < KD < 8 and KD == KH: + new_weight = te.placeholder( + (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CI, CO), + dtype=kernel.dtype) + else: + new_weight = te.placeholder( + (KH + tile_size - 1, KW + tile_size - 1, KD, CI, CO), + dtype=kernel.dtype) new_workload = 
autotvm.task.args_to_workload( [new_data, new_weight, strides, padding, dilation, out_dtype], "conv3d_ncdhw_winograd_without_weight_transform.cuda") diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 0d78bac0133a..2be4bbb456de 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -207,11 +207,19 @@ def schedule_conv3d_winograd_weight_transform(outs): output = outs[0] _, G = s[output].op.input_tensors s[G].compute_inline() - omg, eps, nu, co, ci = s[output].op.axis - r_kd, r_kh, r_kw = s[output].op.reduce_axis - s[output].reorder(co, ci, omg, eps, nu, r_kd, r_kh, r_kw) - for axis in [r_kd, r_kh, r_kw]: - s[output].unroll(axis) + transform_depth = len(s[output].op.reduce_axis) == 3 + if transform_depth: + omg, eps, nu, ci, co = s[output].op.axis + r_kd, r_kh, r_kw = s[output].op.reduce_axis + s[output].reorder(co, ci, omg, eps, nu, r_kd, r_kh, r_kw) + for axis in [r_kd, r_kh, r_kw]: + s[output].unroll(axis) + else: + eps, nu, d, ci, co = s[output].op.axis + r_kh, r_kw = s[output].op.reduce_axis + s[output].reorder(co, ci, d, eps, nu, r_kh, r_kw) + for axis in [r_kh, r_kw]: + s[output].unroll(axis) s[output].parallel(co) return s diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index 388058a14a35..cc2e0bf86624 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -180,7 +180,7 @@ def conv3d_winograd_weight_transform(kernel, tile_size): """ CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) - depth_transform = 2 < KD < 8 + depth_transform = 2 < KD < 8 and KD == KH if depth_transform: assert KD == KH == KW, "Only support NxNxN kernel" @@ -191,7 +191,7 @@ def conv3d_winograd_weight_transform(kernel, tile_size): r_kh = te.reduce_axis((0, KH), name='r_kh') r_kw = te.reduce_axis((0, KW), name='r_kw') - _, _, G = winograd_transform_matrices(tile_size, KD, kernel.dtype) + _, _, G = winograd_transform_matrices(tile_size, KH, kernel.dtype) if depth_transform: shape = (r, r, r, CI, CO) r_kd = te.reduce_axis((0, KD), name='r_kd') diff --git a/topi/tests/python/test_topi_conv3d_winograd.py b/topi/tests/python/test_topi_conv3d_winograd.py index ed66a7871136..6d0d99d00b10 100644 --- a/topi/tests/python/test_topi_conv3d_winograd.py +++ b/topi/tests/python/test_topi_conv3d_winograd.py @@ -118,9 +118,13 @@ def check_device(device): def test_conv3d_ncdhw(): + # Try without depth transformation #3DCNN workloads verify_conv3d_ncdhw(1, 61, 20, 120, 3, 3, 1, 0) + verify_conv3d_ncdhw(1, 61, 20, 120, 1, 3, 1, 0) + verify_conv3d_ncdhw(1, 61, 20, 120, 5, 3, 1, 0) verify_conv3d_ncdhw(1, 61, 20, 120, 5, 5, 1, 2) + verify_conv3d_ncdhw(1, 61, 20, 120, 1, 5, 1, 2) verify_conv3d_ncdhw(1, 61, 20, 120, 7, 7, 1, 3) verify_conv3d_ncdhw(1, 128, 12, 256, 3, 3, 1, 1) verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1) @@ -128,12 +132,15 @@ def test_conv3d_ncdhw(): # bias, relu verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1, add_relu=True) verify_conv3d_ncdhw(1, 64, 12, 128, 3, 3, 1, 1, add_relu=True, add_bias=True) + verify_conv3d_ncdhw(1, 64, 12, 128, 1, 3, 1, 1, add_relu=True, add_bias=True) # dilation = 2 verify_conv3d_ncdhw(1, 16, 12, 16, 3, 3, 1, "VALID", dilation=2) + verify_conv3d_ncdhw(1, 16, 12, 16, 1, 3, 1, "VALID", dilation=2) # batch size verify_conv3d_ncdhw(4, 32, 12, 64, 3, 3, 1, 1) + verify_conv3d_ncdhw(4, 32, 12, 64, 1, 3, 1, 1) # weird workloads verify_conv3d_ncdhw(2, 2, 2, 2, 3, 3, 1, 2) From 51acc4c1f5ac5f2e201ea250aaa33b0290be98df Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 13:48:45 
-0700 Subject: [PATCH 20/28] Fixed bug in schedule. --- tests/python/relay/test_op_level2.py | 9 +++++++-- topi/python/topi/cuda/conv3d_winograd.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index f46dec7607f9..771a63deec69 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -596,15 +596,20 @@ def run_test_conv3d_cuda(dtype, out_dtype, scale, dshape, kshape, kshape = (64, 32, 3, 3, 3) run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, padding=(1, 1, 1), kernel_size=(3, 3, 3)) + # Without depth transform using 1x3x3 kernel. + kshape = (64, 32, 1, 3, 3) run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, - padding=(1, 1, 1), kernel_size=(3, 3, 3), prepack=True) + padding=(0, 1, 1), kernel_size=(1, 3, 3)) + # extended winograd: stride 1, padding N, kernel NxNxN dshape = (1, 61, 20, 20, 20) kshape = (120, 61, 5, 5, 5) run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, padding=(2, 2, 2), channels=120, kernel_size=(5, 5, 5)) + # Without depth transform + kshape = (120, 61, 1, 5, 5) run_test_conv3d_cuda("float32", "float32", 1, dshape, kshape, - padding=(2, 2, 2), kernel_size=(5, 5, 5), prepack=True) + padding=(0, 2, 2), channels=120, kernel_size=(1, 5, 5)) def test_conv2d_transpose_infer_type(): diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 91ab356bfc89..ecc884cea570 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -599,9 +599,9 @@ def _callback(op): @autotvm.register_topi_compute("conv3d_ncdhw_winograd_without_weight_transform.cuda") def conv3d_ncdhw_winograd_without_weight_transform(cfg, data, kernel, strides, padding, dilation, out_dtype): - CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + A, B, C, _, _ = get_const_tuple(kernel.shape) # Check if we can transform depth. - if 2 < KD < 8 and KD == KH: + if A == B == C: return winograd_cuda( cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) else: From 1b5a7a29966cdabc1b859be4c8b8179860889da2 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 14:02:29 -0700 Subject: [PATCH 21/28] lint fixes. 
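The dispatch added in the previous patch has to work from the pre-packed weight alone: after the weight transform the original KD/KH/KW extents are no longer available, so the shape of the packed tensor is inspected instead. Three equal leading extents can only come from the full 3-D transform, (alpha, alpha, alpha, ...), while the depthless layout keeps the raw kernel depth in its third slot, (alpha, alpha, KD, ...). A minimal sketch of that check (the helper name is illustrative, not part of the patch):

    def uses_depth_transform(packed_shape):
        # (alpha, alpha, alpha, ...) -> the depth axis was winograd-transformed.
        # (alpha, alpha, KD, ...)    -> only the two spatial axes were transformed.
        a, b, c = packed_shape[:3]
        return a == b == c

    # A 3x3x3 kernel packed with tile_size=2 gives (4, 4, 4, ...) -> True,
    # while a 1x3x3 kernel gives (4, 4, 1, ...) -> False.  The check assumes a
    # depthless kernel's KD never happens to coincide with alpha.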
--- python/tvm/relay/op/strategy/cuda.py | 6 +++--- src/relay/op/nn/convolution.h | 2 +- topi/python/topi/cuda/conv3d_winograd.py | 27 +++++++++++++----------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index cf412194a0d6..45ee7016912e 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -235,15 +235,15 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() _, kernel = inputs layout = attrs.data_layout - stride_d, stride_h, stride_w = attrs.get_int_tuple("strides") - dilation_d, dilation_h, dilation_w = attrs.get_int_tuple("dilation") + _, stride_h, stride_w = attrs.get_int_tuple("strides") + _, dilation_h, dilation_w = attrs.get_int_tuple("dilation") assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) if layout == "NCDHW": strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), name="conv3d_ncdhw.cuda", plevel=10) - _, _, kd, kh, kw = get_const_tuple(kernel.shape) + _, _, _, kh, kw = get_const_tuple(kernel.shape) if 2 < kh < 8 and 2 < kw < 8 and kh == kw and \ stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index bdea4d99a17a..20724b556df2 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -409,7 +409,7 @@ bool Conv3DWinogradWeightTransformRel(const Array& types, // Shape of packed weights depends on whether depth is being transformed or not. Array oshape({0, 0, 0, data->shape[1], data->shape[0]}); auto* depth_imm = data->shape[2].as(); - bool transform_depth = (depth_imm->value > 2) and (depth_imm->value < 8); + bool transform_depth = (depth_imm->value > 2)&&(depth_imm->value < 8); if (transform_depth) { oshape.Set(0, param->tile_size + data->shape[2] - 1); oshape.Set(1, param->tile_size + data->shape[3] - 1); diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index ecc884cea570..68883a1c26f3 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -179,7 +179,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o pf, pt, pl, pb, pd, pr = nn.get_pad_tuple3d(padding, (KD, KH, KW)) data_pad = nn.pad(data, (0, 0, pf, pt, pl), (0, 0, pb, pd, pr), name="data_pad") - out_depth = simplify((D - KD + pf + pb ) // DSTR + 1) + out_depth = simplify((D - KD + pf + pb) // DSTR + 1) D += pf + pb r = KW @@ -202,10 +202,11 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o else: r_kh = te.reduce_axis((0, KH), name='r_kh') r_kw = te.reduce_axis((0, KW), name='r_kw') - kernel_pack = te.compute((alpha, alpha, KD, CI, CO), lambda eps, nu, d, ci, co: - te.sum(kernel[co][ci][d][r_kh][r_kw] * - G[eps][r_kh] * G[nu][r_kw], - axis=[r_kh, r_kw]), name='kernel_pack') + kernel_pack = te.compute( + (alpha, alpha, KD, CI, CO), + lambda eps, nu, d, ci, co: te.sum( + kernel[co][ci][d][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), + name='kernel_pack') else: kernel_pack = kernel @@ -213,7 +214,8 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o idxmod = tvm.tir.indexmod # pack input tile input_tile = te.compute((CI, D, P, alpha, alpha), lambda c, d, p, eps, nu: - data_pad[idxdiv(p, (nH * 
nW))][c][d][idxmod(idxdiv(p, nW), nH) * m + eps] + data_pad[idxdiv(p, (nH * nW))][c][d] + [idxmod(idxdiv(p, nW), nH) * m + eps] [idxmod(p, nW) * m + nu], name='d') # transform data @@ -577,13 +579,14 @@ def conv3d_ncdhw_winograd(cfg, data, kernel, strides, padding, dilation, out_dty if 2 < KD < 8 and KD == KH: return winograd_cuda( cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) - else: - return winograd_without_depth_cuda( - cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) + + return winograd_without_depth_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False) @autotvm.register_topi_schedule("conv3d_ncdhw_winograd.cuda") def schedule_conv3d_ncdhw_winograd(cfg, outs): + """Dispatch to schedule approriate for conv3d winograd algorithm used.""" s = te.create_schedule([x.op for x in outs]) def _callback(op): @@ -604,9 +607,9 @@ def conv3d_ncdhw_winograd_without_weight_transform(cfg, data, kernel, strides, p if A == B == C: return winograd_cuda( cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) - else: - return winograd_without_depth_cuda( - cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) + + return winograd_without_depth_cuda( + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True) @autotvm.register_topi_schedule("conv3d_ncdhw_winograd_without_weight_transform.cuda") From 5d2aaa95a48fa74e6928edc1293dc755756f60ee Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 14:07:54 -0700 Subject: [PATCH 22/28] Removed indents in convolution.cc --- src/relay/op/nn/convolution.cc | 88 +++++++++++++++++----------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 019da3279a57..c493c55d3880 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -348,14 +348,14 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1] )code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(2) - .add_argument("data", "Tensor", "The input tensor.") - .add_argument("weight", "Tensor", "The weight tensor.") - .set_support_level(2) - .set_attr("FInferCorrectLayout", - ConvInferCorrectLayout) - .add_type_rel("Conv2DTranspose", Conv2DTransposeRel); +.set_attrs_type() +.set_num_inputs(2) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(2) +.set_attr("FInferCorrectLayout", + ConvInferCorrectLayout) +.add_type_rel("Conv2DTranspose", Conv2DTransposeRel); // relay.nn.conv1d_transpose TVM_REGISTER_NODE_TYPE(Conv1DTransposeAttrs); @@ -401,12 +401,12 @@ said convolution. 
out_width = (width-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0] )code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(2) - .add_argument("data", "Tensor", "The input tensor.") - .add_argument("weight", "Tensor", "The weight tensor.") - .set_support_level(2) - .add_type_rel("Conv1DTranspose", Conv1DTransposeRel); +.set_attrs_type() +.set_num_inputs(2) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(2) +.add_type_rel("Conv1DTranspose", Conv1DTransposeRel); // relay.nn.contrib_conv2d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradAttrs); @@ -464,19 +464,19 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_weight_transform" }); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") - .describe(R"code(Weight transformation of winograd fast convolution algorithm. +.describe(R"code(Weight transformation of winograd fast convolution algorithm. Separate this into another operator in order to enable Precompute Pass to compute the weight transformation in advance. - **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) )code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(1) - .add_argument("weight", "Tensor", "The weight tensor.") - .set_support_level(10) - .add_type_rel("Conv2DWinogradWeightTransform", - Conv2DWinogradWeightTransformRel); +.set_attrs_type() +.set_num_inputs(1) +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradWeightTransform", + Conv2DWinogradWeightTransformRel); // relay.nn.contrib_conv3d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv3DWinogradAttrs); @@ -502,9 +502,9 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_without_weight_tr }); RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") - .describe(R"code(Compute conv3d with winograd algorithm. Only supports NCDHW layout. - This operator assumes the weight tensor is already pre-transformed by - nn.contrib_conv3d_winograd_weight_transform. +.describe(R"code(Compute conv3d with winograd algorithm. Only supports NCDHW layout. + This operator assumes the weight tensor is already pre-transformed by + nn.contrib_conv3d_winograd_weight_transform. - **data**: Input is 5D array of shape (batch_size, in_channels, depth, height, width) - **weight**: Any shape @@ -513,14 +513,14 @@ RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") - **out**: Output is 5D array of shape (batch_size, channels, depth, out_height, out_width) )code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(2) - .add_argument("data", "Tensor", "The input tensor.") - .add_argument("weight", "Tensor", "The weight tensor.") - .set_support_level(10) - .add_type_rel("Conv3DWinograd", Conv3DWinogradRel) - .set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); +.set_attrs_type() +.set_num_inputs(2) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv3DWinograd", Conv3DWinogradRel) +.set_attr("FInferCorrectLayout", + ConvInferCorrectLayout); // relay.nn.contrib_conv3d_winograd_weight_transform TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform") @@ -538,12 +538,12 @@ weight transformation in advance. 
- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1], kernel_size[2]) )code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(1) - .add_argument("weight", "Tensor", "The weight tensor.") - .set_support_level(10) - .add_type_rel("Conv3DWinogradWeightTransform", - Conv3DWinogradWeightTransformRel); +.set_attrs_type() +.set_num_inputs(1) +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv3DWinogradWeightTransform", + Conv3DWinogradWeightTransformRel); // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform @@ -570,12 +570,12 @@ weight transformation in advance. - **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) )code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(1) - .add_argument("weight", "Tensor", "The weight tensor.") - .set_support_level(10) - .add_type_rel("Conv2DWinogradNNPACKWeightTransform", - Conv2DWinogradNNPACKWeightTransformRel); +.set_attrs_type() +.set_num_inputs(1) +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradNNPACKWeightTransform", + Conv2DWinogradNNPACKWeightTransformRel); // Positional relay function to create conv2d NCHWc operator From 3d8e7a2bf72b4e02875601ffe6123f1174cca9fd Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 14:11:05 -0700 Subject: [PATCH 23/28] missed a few indents oops. --- src/relay/op/nn/convolution.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index c493c55d3880..efb88f81af50 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -381,7 +381,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.conv1d_transpose") }); RELAY_REGISTER_OP("nn.conv1d_transpose") - .describe(R"code(Transposed 1D convolution layer (sometimes called Deconvolution). +.describe(R"code(Transposed 1D convolution layer (sometimes called Deconvolution). The need for transposed convolutions generally arises from the desire to use a transformation going in the opposite direction @@ -563,7 +563,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_nnpack_weight_tra .set_body_typed(MakeConv2DWinogradNNPACKWeightTransform); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_weight_transform") - .describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. +.describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. Separate this into another symbol in order to enable Precompute Pass to compute the weight transformation in advance. From a92b30efe4f715a8745c8477db2c4c9597591bad Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 17:27:51 -0700 Subject: [PATCH 24/28] fixed flop count. 
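The count being fixed here is meant to be the arithmetic of the equivalent direct convolution rather than of the winograd-transformed computation, which keeps reported GFLOPS comparable with the non-winograd conv3d kernels: every output element accumulates CI * KD * KH * KW multiply-add pairs and there are N * CO * D * H * W outputs. This commit restores the missing output-depth factor D; the kernel-depth factor KD follows in the next commit. A small worked check against one of the test workloads (helper name illustrative):

    def conv3d_direct_flops(N, CO, D, H, W, CI, KD, KH, KW):
        # 2 FLOPs (multiply + add) per accumulated tap, per output element.
        return 2 * N * CO * D * H * W * CI * KD * KH * KW

    # The (1, 61, 20, 120) workload with a 5x5x5 kernel, stride 1, padding 2
    # keeps a 20x20x20 output, so
    # conv3d_direct_flops(1, 120, 20, 20, 20, 61, 5, 5, 5)
    #   == 2 * 120 * 20**3 * 61 * 5**3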
--- topi/python/topi/cuda/conv3d_winograd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 68883a1c26f3..3cb63d4cd756 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -248,7 +248,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o idxmod(h, m), idxmod(w, m)], name='output', tag='conv3d_ncdhw_winograd_without_depth') - cfg.add_flop(2 * N * CO * H * W * CI * KH * KW) + cfg.add_flop(2 * N * CO * D * H * W * CI * KH * KW) return output From d965f36336750ad116e4e60a354f384f8d1ab623 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 3 Apr 2020 17:28:57 -0700 Subject: [PATCH 25/28] One more small tweak. --- topi/python/topi/cuda/conv3d_winograd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 3cb63d4cd756..b8374cc8a5a9 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -248,7 +248,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o idxmod(h, m), idxmod(w, m)], name='output', tag='conv3d_ncdhw_winograd_without_depth') - cfg.add_flop(2 * N * CO * D * H * W * CI * KH * KW) + cfg.add_flop(2 * N * CO * D * H * W * CI * KD * KH * KW) return output From 9827bf9f958a6eef9921cac0c45a1a7fde522ba6 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sat, 4 Apr 2020 21:40:26 +0000 Subject: [PATCH 26/28] Change kernel pack inner axes order. --- src/relay/op/nn/convolution.h | 2 +- topi/python/topi/cuda/conv3d_alter_op.py | 4 ++-- topi/python/topi/cuda/conv3d_winograd.py | 28 ++++++++++++------------ topi/python/topi/nn/conv3d.py | 8 +++---- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 20724b556df2..ee9ad9c9527b 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -407,7 +407,7 @@ bool Conv3DWinogradWeightTransformRel(const Array& types, CHECK_EQ(data->shape.size(), 5) << "Only support NCDHW normal kernel layout"; // Shape of packed weights depends on whether depth is being transformed or not. 
- Array oshape({0, 0, 0, data->shape[1], data->shape[0]}); + Array oshape({0, 0, 0, data->shape[0], data->shape[1]}); auto* depth_imm = data->shape[2].as(); bool transform_depth = (depth_imm->value > 2)&&(depth_imm->value < 8); if (transform_depth) { diff --git a/topi/python/topi/cuda/conv3d_alter_op.py b/topi/python/topi/cuda/conv3d_alter_op.py index 57fa9b6c3a04..fbda45682352 100644 --- a/topi/python/topi/cuda/conv3d_alter_op.py +++ b/topi/python/topi/cuda/conv3d_alter_op.py @@ -79,11 +79,11 @@ def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): # Check if depth is transformed or not if 2 < KD < 8 and KD == KH: new_weight = te.placeholder( - (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CI, CO), + (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CO, CI), dtype=kernel.dtype) else: new_weight = te.placeholder( - (KH + tile_size - 1, KW + tile_size - 1, KD, CI, CO), + (KH + tile_size - 1, KW + tile_size - 1, KD, CO, CI), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_weight, strides, padding, dilation, out_dtype], diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index b8374cc8a5a9..2d67c15dc3fe 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -58,7 +58,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ else: # kernel tensor is pre-transfomred. this op is created by alter op layout. # dilation is not supported - alpha, _, _, CI, CO = get_const_tuple(kernel.shape) + alpha, _, _, CO, CI = get_const_tuple(kernel.shape) KD = KH = KW = alpha + 1 - tile_size assert DSTR == 1 and HSTR == 1 and WSTR == 1 and \ dilation_d == 1 and dilation_h == 1 and dilation_w == 1 @@ -81,7 +81,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ # Check if we are currently tuning, if so we want to avoid counting # prepacking in time costs. Just use a placeholder with the packed shape instead. if autotvm.GLOBAL_SCOPE.in_tuning: - kernel_pack = te.placeholder((alpha, alpha, alpha, CI, CO), + kernel_pack = te.placeholder((alpha, alpha, alpha, CO, CI), dtype=kernel.dtype, name='kernel_pack') else: @@ -89,8 +89,8 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ r_kh = te.reduce_axis((0, KH), name='r_kh') r_kw = te.reduce_axis((0, KW), name='r_kw') kernel_pack = te.compute( - (alpha, alpha, alpha, CI, CO), - lambda omg, eps, nu, ci, co: te.sum( + (alpha, alpha, alpha, CO, CI), + lambda omg, eps, nu, co, ci: te.sum( kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kd, r_kh, r_kw]), name='kernel_pack') @@ -124,7 +124,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ bgemm = te.compute( (alpha, alpha, alpha, CO, P), lambda omg, eps, nu, co, p: te.sum( - kernel_pack[omg][eps][nu][ci][co] * data_pack[omg][eps][nu][ci][p], axis=[ci]), + kernel_pack[omg][eps][nu][co][ci] * data_pack[omg][eps][nu][ci][p], axis=[ci]), name='bgemm') # inverse transform @@ -173,7 +173,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o else: # kernel tensor is pre-transfomred. this op is created by alter op layout. 
# dilation is not supported - alpha, _, KD, CI, CO = get_const_tuple(kernel.shape) + alpha, _, KD, CO, CI = get_const_tuple(kernel.shape) KH = KW = alpha + 1 - tile_size assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1 @@ -196,15 +196,15 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o # During autotuning dont count kernel packing as a time cost # as it will later be removed via alter_op_layout. if autotvm.GLOBAL_SCOPE.in_tuning: - kernel_pack = te.placeholder((alpha, alpha, KD, CI, CO), + kernel_pack = te.placeholder((alpha, alpha, KD, CO, CI), dtype=kernel.dtype, name='kernel_pack') else: r_kh = te.reduce_axis((0, KH), name='r_kh') r_kw = te.reduce_axis((0, KW), name='r_kw') kernel_pack = te.compute( - (alpha, alpha, KD, CI, CO), - lambda eps, nu, d, ci, co: te.sum( + (alpha, alpha, KD, CO, CI), + lambda eps, nu, d, co, ci: te.sum( kernel[co][ci][d][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='kernel_pack') else: @@ -229,7 +229,7 @@ def winograd_without_depth_cuda(cfg, data, kernel, strides, padding, dilation, o ci = te.reduce_axis((0, CI), name='ci') rz = te.reduce_axis((0, KD), name='rz') bgemm = te.compute((alpha, alpha, CO, out_depth, P), lambda eps, nu, co, d, p: - te.sum(kernel_pack[eps][nu][rz][ci][co] * + te.sum(kernel_pack[eps][nu][rz][co][ci] * data_pack[eps][nu][ci][d * DSTR + rz][p], axis=[ci, rz]), name='bgemm') @@ -288,14 +288,14 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): # transform kernel if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning: kernel, G = s[kernel_pack].op.input_tensors - omg, eps, nu, ci, co = s[kernel_pack].op.axis + omg, eps, nu, co, ci = s[kernel_pack].op.axis s[G].compute_inline() r_a, r_b, r_c = s[kernel_pack].op.reduce_axis # Could add additional unrolling by omg, eps, nu in the future. 
for axis in [r_a, r_b, r_c]: s[kernel_pack].unroll(axis) - fused = s[kernel_pack].fuse(ci, co) + fused = s[kernel_pack].fuse(co, ci) bb, tt = s[kernel_pack].split(fused, 128) s[kernel_pack].reorder(bb, tt, omg, eps, nu, r_a, r_b, r_c) s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) @@ -446,13 +446,13 @@ def schedule_winograd_no_depth_cuda(cfg, s, output, pre_computed): # transform kernel if not pre_computed and not autotvm.GLOBAL_SCOPE.in_tuning: kernel, G = s[kernel_pack].op.input_tensors - eps, nu, kd, ci, co = s[kernel_pack].op.axis + eps, nu, kd, co, ci = s[kernel_pack].op.axis s[G].compute_inline() r_a, r_b = s[kernel_pack].op.reduce_axis for axis in [eps, nu, r_a, r_b]: s[kernel_pack].unroll(axis) - fused = s[kernel_pack].fuse(kd, ci, co) + fused = s[kernel_pack].fuse(kd, co, ci) bb, tt = s[kernel_pack].split(fused, 128) s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b) s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index cc2e0bf86624..2bac284ab401 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -193,19 +193,19 @@ def conv3d_winograd_weight_transform(kernel, tile_size): r_kw = te.reduce_axis((0, KW), name='r_kw') _, _, G = winograd_transform_matrices(tile_size, KH, kernel.dtype) if depth_transform: - shape = (r, r, r, CI, CO) + shape = (r, r, r, CO, CI) r_kd = te.reduce_axis((0, KD), name='r_kd') return te.compute( shape, - lambda omg, eps, nu, ci, co: te.sum( + lambda omg, eps, nu, co, ci: te.sum( kernel[co][ci][r_kd][r_kh][r_kw] * G[omg][r_kd] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kd, r_kh, r_kw]), name='transform_weight') else: - shape = (r, r, KD, CI, CO) + shape = (r, r, KD, CO, CI) return te.compute( shape, - lambda eps, nu, d, ci, co: te.sum( + lambda eps, nu, d, co, ci: te.sum( kernel[co][ci][d][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='transform_weight') From 9b19e52c244c2f80c18c11f62955ae13615804e2 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sat, 4 Apr 2020 22:13:14 +0000 Subject: [PATCH 27/28] Style changes. --- include/tvm/relay/attrs/nn.h | 2 +- src/relay/op/nn/convolution.cc | 11 +++----- src/relay/op/nn/convolution.h | 32 ++++++++---------------- topi/python/topi/cuda/conv3d_winograd.py | 2 +- 4 files changed, 17 insertions(+), 30 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 6fbaadee772f..536e4145db29 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -322,7 +322,7 @@ struct Conv3DWinogradAttrs : public tvm::AttrsNode { TVM_DECLARE_ATTRS(Conv3DWinogradAttrs, "relay.attrs.Conv3DWinogradAttrs") { TVM_ATTR_FIELD(tile_size) - .describe("The tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)"); + .describe("The tile size of winograd. E.g. 2 for F(2x2x2, 3x3x3) and 4 for F(4x4x4, 3x3x3)"); TVM_ATTR_FIELD(strides).set_default(Array({1, 1, 1})) .describe("Specifies the strides of the convolution."); TVM_ATTR_FIELD(padding).set_default(Array({0, 0, 0})) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index efb88f81af50..66dab57fd947 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -327,7 +327,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.conv2d_transpose") }); RELAY_REGISTER_OP("nn.conv2d_transpose") - .describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution). +.describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution). 
The need for transposed convolutions generally arises from the desire to use a transformation going in the opposite direction @@ -475,8 +475,7 @@ weight transformation in advance. .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) -.add_type_rel("Conv2DWinogradWeightTransform", - Conv2DWinogradWeightTransformRel); +.add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); // relay.nn.contrib_conv3d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv3DWinogradAttrs); @@ -542,8 +541,7 @@ weight transformation in advance. .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) -.add_type_rel("Conv3DWinogradWeightTransform", - Conv3DWinogradWeightTransformRel); +.add_type_rel("Conv3DWinogradWeightTransform", Conv3DWinogradWeightTransformRel); // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform @@ -574,8 +572,7 @@ weight transformation in advance. .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) -.add_type_rel("Conv2DWinogradNNPACKWeightTransform", - Conv2DWinogradNNPACKWeightTransformRel); +.add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); // Positional relay function to create conv2d NCHWc operator diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index ee9ad9c9527b..f04693a72a7a 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -365,16 +365,13 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, // Winograd convolution shape relations -template -bool Conv2DWinogradWeightTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { +inline bool Conv2DWinogradWeightTransformRel(const Array& types, int num_inputs, + const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; - const ConvWinogradWeightTransformAttrs* param = attrs.as(); + const ConvWinogradWeightTransformAttrs* param = attrs.as(); CHECK(param != nullptr); CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; @@ -391,17 +388,13 @@ bool Conv2DWinogradWeightTransformRel(const Array& types, return true; } - -template -bool Conv3DWinogradWeightTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { +inline bool Conv3DWinogradWeightTransformRel(const Array& types, int num_inputs, + const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; - const ConvWinogradWeightTransformAttrs* param = attrs.as(); + const ConvWinogradWeightTransformAttrs* param = attrs.as(); CHECK(param != nullptr); CHECK_EQ(data->shape.size(), 5) << "Only support NCDHW normal kernel layout"; @@ -424,19 +417,17 @@ bool Conv3DWinogradWeightTransformRel(const Array& types, return true; } - -template -bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { +inline bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { return false; } - const Conv2DWinogradNNPACKWeightTransformAttrs* param = attrs.as(); + const 
Conv2DWinogradNNPACKWeightTransformAttrs* param = + attrs.as(); CHECK(param != nullptr); CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; @@ -456,7 +447,6 @@ bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, return true; } - template bool Conv2DWinogradRel(const Array& types, int num_inputs, diff --git a/topi/python/topi/cuda/conv3d_winograd.py b/topi/python/topi/cuda/conv3d_winograd.py index 2d67c15dc3fe..c9e84468176d 100644 --- a/topi/python/topi/cuda/conv3d_winograd.py +++ b/topi/python/topi/cuda/conv3d_winograd.py @@ -56,7 +56,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ alpha = KW + tile_size - 1 assert DSTR == 1 and HSTR == 1 and WSTR == 1 and KD == KH and KH == KW else: - # kernel tensor is pre-transfomred. this op is created by alter op layout. + # kernel tensor is pre-transformed. this op is created by alter op layout. # dilation is not supported alpha, _, _, CO, CI = get_const_tuple(kernel.shape) KD = KH = KW = alpha + 1 - tile_size From 0f583bb20d8a1e35a8b1913989f9610b1266a4b3 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sun, 5 Apr 2020 13:00:15 -0700 Subject: [PATCH 28/28] Comment fixes. --- python/tvm/relay/op/nn/nn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index eb0015921ef1..15ed10e7c667 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -328,7 +328,7 @@ def contrib_conv3d_winograd_without_weight_transform(data, The weight expressions. tile_size : int - The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) + The Tile size of winograd. E.g. 2 for F(2x2x2, 3x3x3) and 4 for F(4x4x4, 3x3x3) strides : tuple of int, optional The strides of convolution. @@ -2012,7 +2012,7 @@ def contrib_conv3d_winograd_weight_transform(weight, The weight expressions. tile_size : int - The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) + The Tile size of winograd. E.g. 2 for F(2x2x2, 3x3x3) and 4 for F(4x4x4, 3x3x3) Returns -------