From 1499d0a40a9cac5759132e0788907452aa682617 Mon Sep 17 00:00:00 2001
From: cchung100m <cchung100m@cs.ccu.edu.tw>
Date: Thu, 28 May 2020 00:20:15 +0800
Subject: [PATCH 1/4] [AutoTVM][TOPI] Fix bifrost spatial packing conv2d auto
 tune

---
 topi/python/topi/bifrost/conv2d.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py
index 92e874afa2a5..eb1b3d14213c 100644
--- a/topi/python/topi/bifrost/conv2d.py
+++ b/topi/python/topi/bifrost/conv2d.py
@@ -142,13 +142,14 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
         s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
+        co, ci, kh, kw, vc = s[kernel_vec].op.axis
         if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], 'debug_skip_region')
+            # Directly use modified data layout placeholder.
+            kvshape = (co // vc, ci, kh, kw, vc)
+            kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
+            s[kernel_vec] = kernel_vec
         else:
             max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
             fused, vec = s[kernel_vec].split(fused, VC)
             bb, tt = s[kernel_vec].split(fused, max_threads)
@@ -370,12 +371,7 @@ def _schedule_winograd(cfg, s, op):
         s[G].compute_inline()
         eps, _, _, _ = s[U].op.axis
         y, _, _, _ = s[padded_kernel].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # Kernel transformation will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[U].pragma(eps, 'debug_skip_region')
-            s[padded_kernel].pragma(y, 'debug_skip_region')
-        else:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
             # Pad kernel
             y, x, ky, kx = s[padded_kernel].op.axis
             s[padded_kernel].unroll(ky)

From bf727f92b7f0c99757395e0d5e6b68be54c34448 Mon Sep 17 00:00:00 2001
From: cchung100m <cchung100m@cs.ccu.edu.tw>
Date: Sat, 30 May 2020 12:22:12 +0800
Subject: [PATCH 2/4] [AutoTVM][TOPI] Putting placeholder replacement in
 compute

---
 .../topi/arm_cpu/conv2d_spatial_pack.py       | 23 ++++++++++++-------
 topi/python/topi/bifrost/conv2d.py            |  9 ++------
 topi/python/topi/mali/conv2d.py               |  9 ++------
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
index a4d7ad83b1c8..c4bcd3427ef8 100644
--- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
@@ -109,12 +109,15 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation,
                               data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
                               name='data_vec')
 
-    if pre_packed:
-        kernel_vec = kernel
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel")
     else:
-        kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc:
-                                kernel[co*VC+vc][ci][kh][kw],
-                                name='kernel_vec')
+        if pre_packed:
+            kernel_vec = kernel
+        else:
+            kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc:
+                                    kernel[co*VC+vc][ci][kh][kw],
+                                    name='kernel_vec')
 
     ci = te.reduce_axis((0, CI), name='ci')
     kh = te.reduce_axis((0, KH), name='kh')
@@ -267,9 +270,13 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_
         data_vec = te.compute(dvshape, lambda n, oho, owo, ohi, owi, ic:
                               data_pad[n][oho*OHI*HSTR+ohi][owo*OWI*WSTR+owi][ic],
                               name='data_vec')
-    kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \
-                            kernel[kh][kw][ic][oco*OCI+oci],
-                            name='kernel_vec')
+
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel")
+    else:
+        kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \
+                                kernel[kh][kw][ic][oco*OCI+oci],
+                                name='kernel_vec')
 
     ic = te.reduce_axis((0, IC), name='ic')
     kh = te.reduce_axis((0, KH), name='kh')
diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py
index eb1b3d14213c..82730ec7b6e5 100644
--- a/topi/python/topi/bifrost/conv2d.py
+++ b/topi/python/topi/bifrost/conv2d.py
@@ -142,14 +142,9 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
         s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
-        co, ci, kh, kw, vc = s[kernel_vec].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # Directly use modified data layout placeholder.
-            kvshape = (co // vc, ci, kh, kw, vc)
-            kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
-            s[kernel_vec] = kernel_vec
-        else:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
             max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
             fused, vec = s[kernel_vec].split(fused, VC)
             bb, tt = s[kernel_vec].split(fused, max_threads)
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 12eb3d7c78c8..6f288c4b43fc 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,14 +138,9 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
         s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
-        co, ci, kh, kw, vc = s[kernel_vec].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # Directly use modified data layout placeholder.
-            kvshape = (co // vc, ci, kh, kw, vc)
-            kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
-            s[kernel_vec] = kernel_vec
-        else:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
             max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
             fused, vec = s[kernel_vec].split(fused, VC)
             bb, tt = s[kernel_vec].split(fused, max_threads)

From f2e7fd343f55ecc0897f21d09bcf9a88421ee49f Mon Sep 17 00:00:00 2001
From: cchung100m <cchung100m@cs.ccu.edu.tw>
Date: Sun, 31 May 2020 13:51:27 +0800
Subject: [PATCH 3/4] Fix winograd kernel replacement

---
 topi/python/topi/arm_cpu/conv2d.py            | 21 ++++++++++++-------
 .../topi/arm_cpu/conv2d_spatial_pack.py       | 21 ++++++++-----------
 topi/python/topi/bifrost/conv2d.py            | 11 +++++++---
 topi/python/topi/mali/conv2d.py               | 21 ++++++++++++-------
 4 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index df63ae3e9e59..4faee42f75cc 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -167,15 +167,20 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, til
                                      idxm(b*VP + bb, nW) * m + nu],
                             name='d')
 
-    # transform kernel
-    if pre_computed:
-        U = kernel
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        VC = cfg['tile_k'].size[-1]
+        kvshape = (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC)
+        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
     else:
-        r_kh = te.reduce_axis((0, KH), 'r_kh')
-        r_kw = te.reduce_axis((0, KW), 'r_kw')
-        U = te.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk:
-                       te.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
-                              G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')
+        # transform kernel
+        if pre_computed:
+            U = kernel
+        else:
+            r_kh = te.reduce_axis((0, KH), 'r_kh')
+            r_kw = te.reduce_axis((0, KW), 'r_kw')
+            U = te.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk:
+                           te.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
+                                  G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U')
 
     # transform image
     r_eps = te.reduce_axis((0, alpha), 'r_eps')
diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
index c4bcd3427ef8..8cf8401e7a07 100644
--- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py
@@ -190,12 +190,8 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
     s[data_vec].parallel(h)
 
     if kernel_vec.op.name == 'kernel_vec':
-        co, _, _, _, _ = s[kernel_vec].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[kernel_vec].pragma(co, 'debug_skip_region')
-        else:
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
+            co, _, _, _, _ = s[kernel_vec].op.axis
             s[kernel_vec].parallel(co)
     elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose':  # for conv2d transpose
         co, _, _, _, _ = s[kernel_vec].op.axis
@@ -346,12 +342,13 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
         s[kernel_vec].compute_at(s[conv], compat_axis)
         s[data_vec].compute_at(s[conv], compat_axis)
 
-    # schedule kernel pack
-    oco, kh, kw, ic, oci = kernel_vec.op.axis
-    s[kernel_vec].vectorize(oci)
-    s[kernel_vec].unroll(ic)
-    if cfg['compat'].val == 2:
-        s[kernel_vec].parallel(oco)
+    if not autotvm.GLOBAL_SCOPE.in_tuning:
+        # schedule kernel pack
+        oco, kh, kw, ic, oci = kernel_vec.op.axis
+        s[kernel_vec].vectorize(oci)
+        s[kernel_vec].unroll(ic)
+        if cfg['compat'].val == 2:
+            s[kernel_vec].parallel(oco)
 
     # schedule data pack
     if data_vec.op.name == 'data_vec_undilated':
diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py
index 82730ec7b6e5..ecc67c735a58 100644
--- a/topi/python/topi/bifrost/conv2d.py
+++ b/topi/python/topi/bifrost/conv2d.py
@@ -309,10 +309,15 @@ def upround(x, align):
                             data_pad[n][c][h][w],
                             name='d')
 
-    if pre_computed:
-        U = kernel
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        VC = cfg['tile_k'].size[-1]
+        kvshape = (KH + tile_size - 1, KW + tile_size - 1, tvm.tir.indexdiv(CO, VC), CI, VC)
+        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
     else:
-        U = _decl_winograd_kernel_transform(kernel, tile_size, G)
+        if pre_computed:
+            U = kernel
+        else:
+            U = _decl_winograd_kernel_transform(kernel, tile_size, G)
 
     # V [alpha * alpha, C, P_round)
     # Perform the image transform
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 6f288c4b43fc..d614699489e4 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -275,15 +275,20 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, til
             data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
             [(b*bnb+bb) % nW * m + nu], tvm.tir.const(0, data_pad.dtype)), name='d')
 
-    # transform kernel
-    if pre_computed:
-        U = kernel
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        VC = cfg['tile_k'].size[-1]
+        kvshape = (KH + tile_size - 1, KW + tile_size - 1, tvm.tir.indexdiv(CO, VC), CI, VC)
+        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
     else:
-        r_kh = te.reduce_axis((0, KH), 'r_kh')
-        r_kw = te.reduce_axis((0, KW), 'r_kw')
-        U = te.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
-                       te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
-                              axis=[r_kh, r_kw]), name='U')
+        # transform kernel
+        if pre_computed:
+            U = kernel
+        else:
+            r_kh = te.reduce_axis((0, KH), 'r_kh')
+            r_kw = te.reduce_axis((0, KW), 'r_kw')
+            U = te.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
+                           te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
+                                  axis=[r_kh, r_kw]), name='U')
 
     # transform image
     r_a = te.reduce_axis((0, alpha), 'r_a')

From cc8c7bcfa13864c7c7f1fbba59ed1981d9d56bfb Mon Sep 17 00:00:00 2001
From: cchung100m <cchung100m@cs.ccu.edu.tw>
Date: Sun, 31 May 2020 14:31:04 +0800
Subject: [PATCH 4/4] Fix sanity check: Line too long

---
 topi/python/topi/mali/conv2d.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index d614699489e4..ed1932674964 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -287,7 +287,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, til
             r_kh = te.reduce_axis((0, KH), 'r_kh')
             r_kw = te.reduce_axis((0, KW), 'r_kw')
             U = te.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
-                           te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
+                           te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] *
+                                  G[eps][r_kh] * G[nu][r_kw],
                                   axis=[r_kh, r_kw]), name='U')
 
     # transform image