From ce6ea3720612917cc6ca3ba8b089800e540545a7 Mon Sep 17 00:00:00 2001
From: cchung100m
Date: Tue, 5 May 2020 23:53:35 +0800
Subject: [PATCH 1/3] [AutoTVM][TOPI] AutoTVM incorrect measurement

---
 topi/python/topi/mali/conv2d.py | 38 ++++++++++++---------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index d19592857086..30cdc609c1a7 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,20 +138,15 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
             s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], 'debug_skip_region')
-        else:
-            max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
-            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
-            fused, vec = s[kernel_vec].split(fused, VC)
-            bb, tt = s[kernel_vec].split(fused, max_threads)
-            s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
-            s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
-            if VC in vec_size:
-                s[kernel_vec].vectorize(vec)
+        max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+        co, ci, kh, kw, vc = s[kernel_vec].op.axis
+        fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
+        fused, vec = s[kernel_vec].split(fused, VC)
+        bb, tt = s[kernel_vec].split(fused, max_threads)
+        s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
+        s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
+        if VC in vec_size:
+            s[kernel_vec].vectorize(vec)
 
     # schedule convolution
     n, c, h, w, vh, vw, vc = s[conv].op.axis
@@ -345,16 +340,11 @@ def _schedule_winograd(cfg, s, op):
         kernel, G = s[U].op.input_tensors
         s[G].compute_inline()
         eps, nu, co, ci, vco, = s[U].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel transformation will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[U].pragma(eps, 'debug_skip_region')
-        else:
-            r_kh, r_kw = s[U].op.reduce_axis
-            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
-            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-            s[U].vectorize(vco)
-            tile_and_bind(s, U, co, ci, 1, 256)
+        r_kh, r_kw = s[U].op.reduce_axis
+        s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
+        _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
+        s[U].vectorize(vco)
+        tile_and_bind(s, U, co, ci, 1, 256)
 
     # dilation
     if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
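Background for the series: the old code excluded kernel packing from tuning
measurements by wrapping it in a 'debug_skip_region' pragma, which led to the
incorrect measurements named in the subject. Patches 2/3 and 3/3 below restore
the in_tuning special case, but implement it by handing the schedule a
placeholder that already has the converted layout, so the packing stage is
never part of the measured kernel. A minimal sketch of that placeholder
pattern follows; the shapes CO, CI, KH, KW, VC and the packing lambda are
illustrative assumptions, not the patch's code:

    from tvm import te, autotvm

    CO, CI, KH, KW, VC = 64, 32, 3, 3, 4  # assumed sizes, for illustration

    # The schedule normally packs the kernel into a vectorized layout:
    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
    kernel_vec = te.compute(
        (CO // VC, CI, KH, KW, VC),
        lambda co, ci, kh, kw, vc: kernel[co * VC + vc, ci, kh, kw],
        name="kernel_vec",
    )

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # During tuning, treat the packed kernel as a plain input so the
        # packing work is excluded from the measured time.
        kernel_vec = te.placeholder(kernel_vec.shape, kernel_vec.dtype,
                                    name="kernel")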
From 64d1e6a38d5c44deb86046d46a5e0c4167b224d8 Mon Sep 17 00:00:00 2001
From: cchung100m
Date: Mon, 25 May 2020 21:05:28 +0800
Subject: [PATCH 2/3] create new placeholder with converted layout

---
 topi/python/topi/mali/conv2d.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 30cdc609c1a7..91bdd360b1b2 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,15 +138,22 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
             s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
-        max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-        co, ci, kh, kw, vc = s[kernel_vec].op.axis
-        fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
-        fused, vec = s[kernel_vec].split(fused, VC)
-        bb, tt = s[kernel_vec].split(fused, max_threads)
-        s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
-        s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
-        if VC in vec_size:
-            s[kernel_vec].vectorize(vec)
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # Directly use modified data layout placeholder.
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
+            kvshape = (co // vc, ci, kh, kw, vc)
+            kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
+            s[kernel_vec] = kernel_vec
+        else:
+            max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
+            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
+            fused, vec = s[kernel_vec].split(fused, VC)
+            bb, tt = s[kernel_vec].split(fused, max_threads)
+            s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
+            s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
+            if VC in vec_size:
+                s[kernel_vec].vectorize(vec)
 
     # schedule convolution
     n, c, h, w, vh, vw, vc = s[conv].op.axis

From 1629b567270eab07d71121bf1fafd85bd3797c3e Mon Sep 17 00:00:00 2001
From: cchung100m
Date: Tue, 26 May 2020 19:01:15 +0800
Subject: [PATCH 3/3] update _schedule_winograd

---
 topi/python/topi/mali/conv2d.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 91bdd360b1b2..12eb3d7c78c8 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,15 +138,14 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
             s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
+        co, ci, kh, kw, vc = s[kernel_vec].op.axis
         if autotvm.GLOBAL_SCOPE.in_tuning:
             # Directly use modified data layout placeholder.
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             kvshape = (co // vc, ci, kh, kw, vc)
             kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
             s[kernel_vec] = kernel_vec
         else:
             max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
             fused, vec = s[kernel_vec].split(fused, VC)
             bb, tt = s[kernel_vec].split(fused, max_threads)
@@ -347,11 +346,12 @@ def _schedule_winograd(cfg, s, op):
         kernel, G = s[U].op.input_tensors
         s[G].compute_inline()
         eps, nu, co, ci, vco, = s[U].op.axis
-        r_kh, r_kw = s[U].op.reduce_axis
-        s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
-        _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-        s[U].vectorize(vco)
-        tile_and_bind(s, U, co, ci, 1, 256)
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
+            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
+            s[U].vectorize(vco)
+            tile_and_bind(s, U, co, ci, 1, 256)
 
     # dilation
     if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
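For reviewers who want to trace when the new branches run: AutoTVM's
measurement side sets autotvm.GLOBAL_SCOPE.in_tuning while building
candidates, so the placeholder path above is active only during tuning, and a
normal build still emits the full packing and transform schedules. A small
sketch of that control flow; the flag is toggled by hand here purely for
illustration, since the tuner manages it internally:

    from tvm import autotvm

    # Outside a tuning session the flag is False, so a normal build takes
    # the full schedule path (fuse/split/bind of kernel_vec).
    assert not autotvm.GLOBAL_SCOPE.in_tuning

    autotvm.GLOBAL_SCOPE.in_tuning = True   # what the measure side sets
    # ... build and time a tuning candidate here; kernel packing is now a
    # plain placeholder input, so it does not appear in the measured time ...
    autotvm.GLOBAL_SCOPE.in_tuning = False  # restored for normal builds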