From ab29a0d45829235ef6c24ea78eb4a1e03d363018 Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Wed, 15 Feb 2023 16:19:28 +0000
Subject: [PATCH 1/5] [TOPI] Bugfix arm_cpu schedule_conv2d_spatial_pack_nhwc
 schedule

No changes to the compute, various bugfixes and improvements to the
corresponding NHWC schedule:

* There is currently a block that is not run as part of tuning trials
but does get run during compilation with tuning logs. Since a lot of
unrolling and vectorization happens there, for some conv2d operators it
results in roughly an 18x increase in asm size and can take around 10
minutes per operator to compile. That essentially makes whole networks
uncompilable, so remove that block.
* There is no fallback config, and there are no NHWC logs in TopHub, so
add a fallback config. This significantly reduces the compile time when
no tuning logs are available, e.g. by about 10x for mobilenet.
* The order of axes passed to reorder_conv when applying it differed from
the order used to define the reorder. Judging by the compute definition
and by tuning results of whole networks, this appears to be a bug, so
make the two orders match.
* Constrain potential unrolling to the OWI and OCI axes, as unrolling
across OHI results in code too large to compile. This change reduces the
number of unsuccessful tuning trials from about 50% to about 20%.
* Other minor tweaks.

Change-Id: I426b80154ddae96bf7b9f06e05e178eee2a8b087
---
 .../tvm/topi/arm_cpu/conv2d_spatial_pack.py   | 76 +++++++++----------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index 4eed56a22572..43fc4f413dff 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -23,6 +23,7 @@
 from .. import nn
 from ..utils import get_const_tuple
 from ..nn.utils import get_const_int, get_pad_tuple
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity
 
 
 def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile):
@@ -302,15 +303,25 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_
     )
 
     cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [ohi, owi, oci], policy="try_unroll_vec")
+    cfg.define_annotate("ann_spatial", [owi, oci], policy="try_unroll_vec")
     # ====================================================================
 
-    OCI = cfg["tile_co"].size[-1]
+    # If there are no tuning records, use this config
+    if cfg.is_fallback:
+        cfg["tile_oh"] = SplitEntity([-1, 1])
+        cfg["tile_ow"] = SplitEntity([-1, 8])
+        cfg["tile_oc"] = SplitEntity([-1, 8])
+        cfg["ann_spatial"] = AnnotateEntity(["none", "vec"])
+        cfg["ann_reduce"] = AnnotateEntity(["none", "none"])
+        cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        cfg["compat"] = OtherOptionEntity(0)
+
+    OCI = cfg["tile_oc"].size[-1]
     OHI = cfg["tile_oh"].size[-1]
     OWI = cfg["tile_ow"].size[-1]
-    OCO = OC // OCI
+    OCO = max(1, OC // OCI)
     OHO = OH // OHI
-    OWO = OW // OWI
+    OWO = max(1, OW // OWI)
 
     kvshape = (OCO, KH, KW, IC, OCI)
     ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI)
@@ -390,32 +401,30 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
     data_vec = conv.op.input_tensors[0]
     kernel_vec = conv.op.input_tensors[1]
     data_pad = data_vec.op.input_tensors[0]
-    OHI = cfg["tile_oh"].size[-1]
+
     OWI = cfg["tile_ow"].size[-1]
-    OCI = cfg["tile_co"].size[-1]
+    OCI = cfg["tile_oc"].size[-1]
 
     # schedule unpack/output
     if output != unpack:
         s[unpack].compute_inline()
     n, oh, ow, oc = s[output].op.axis
-    oco, oci = cfg["tile_co"].apply(s, output, oc)
+    oco, oci = cfg["tile_oc"].apply(s, output, oc)
     oho, ohi = cfg["tile_oh"].apply(s, output, oh)
     owo, owi = cfg["tile_ow"].apply(s, output, ow)
     s[output].reorder(n, oho, owo, oco, ohi, owi, oci)
-    cfg["ann_spatial"].apply(
-        s, output, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg
-    )
-    cfg.define_knob("compat", [0, 1, 2])
-    if cfg["compat"].val < 2:
-        compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
-        s[conv].compute_at(s[output], compat_axis)
+    cfg["ann_spatial"].apply(s, output, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg)
+
+    cfg.define_knob("compat", [0, 1])
+    compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
+    s[conv].compute_at(s[output], compat_axis)
     paxis = s[output].fuse(n, oho)
     s[output].parallel(paxis)
 
     # schedule conv
     n, oho, owo, oco, ohi, owi, oci = s[conv].op.axis
     ic, kh, kw = s[conv].op.reduce_axis
-    cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ohi, owi, ic, oci])
+    cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci])
     cfg["ann_reduce"].apply(
         s,
         conv,
@@ -424,33 +433,22 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
         max_unroll=16,
         cfg=cfg,
     )
-    cfg["ann_spatial"].apply(
-        s, conv, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg
-    )
-    if cfg["compat"].val < 2:
-        compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
-        s[kernel_vec].compute_at(s[conv], compat_axis)
-        s[data_vec].compute_at(s[conv], compat_axis)
-
-    if not autotvm.GLOBAL_SCOPE.in_tuning:
-        # schedule kernel pack
-        oco, kh, kw, ic, oci = kernel_vec.op.axis
-        s[kernel_vec].vectorize(oci)
-        s[kernel_vec].unroll(ic)
-        if cfg["compat"].val == 2:
-            s[kernel_vec].parallel(oco)
-
-    # schedule data pack
+    cfg["ann_spatial"].apply(s, conv, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg)
+
+    # schedule data_vec, data_pad and kernel_vec
+    compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
+    s[kernel_vec].compute_at(s[conv], compat_axis)
+    s[data_vec].compute_at(s[conv], compat_axis)
+
+    # Inlining kernel vec brings a performance improvement, but the tuner seems to not
+    # like it, so inline only when we are using the fallback config
+    if cfg.is_fallback:
+        s[kernel_vec].compute_inline()
+
     if data_vec.op.name == "data_vec_undilated":
         n, oho, owo, kh, kw, ic, ohi, owi = s[data_vec].op.axis
-        s[data_vec].vectorize(owi)
-        s[data_vec].unroll(ohi)
     else:
         n, oho, owo, ohi, owi, ic = s[data_vec].op.axis
-        s[data_vec].vectorize(ic)
-        s[data_vec].unroll(owi)
-    if cfg["compat"].val == 2:
-        paxis = s[data_vec].fuse(n, oho)
-        s[data_vec].parallel(paxis)
+    s[data_pad].compute_at(s[data_vec], n)
 
     return s

From 3ca715e6c643f87849e967191cc48ba8470407eb Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Thu, 16 Feb 2023 15:16:17 +0000
Subject: [PATCH 2/5] Fix linting and bug in the default config

---
 python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index 43fc4f413dff..5a80fef045c5 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -20,10 +20,10 @@
 import tvm
 from tvm import te
 from tvm import autotvm
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity
 from .. import nn
 from ..utils import get_const_tuple
 from ..nn.utils import get_const_int, get_pad_tuple
-from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity
 
 
 def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile):
@@ -308,9 +308,17 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_
 
     # If there are no tuning records, use this config
     if cfg.is_fallback:
+
+        def _tile_size(axis, candidates):
+            for candidate in candidates:
+                tiles_divisible_by_candidate = axis % candidate == 0
+                if tiles_divisible_by_candidate:
+                    return candidate
+            return 1
+
         cfg["tile_oh"] = SplitEntity([-1, 1])
-        cfg["tile_ow"] = SplitEntity([-1, 8])
-        cfg["tile_oc"] = SplitEntity([-1, 8])
+        cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])])
+        cfg["tile_oc"] = SplitEntity([-1, _tile_size(OC, [8, 4])])
         cfg["ann_spatial"] = AnnotateEntity(["none", "vec"])
         cfg["ann_reduce"] = AnnotateEntity(["none", "none"])
         cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
@@ -319,9 +327,9 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_
     OCI = cfg["tile_oc"].size[-1]
     OHI = cfg["tile_oh"].size[-1]
     OWI = cfg["tile_ow"].size[-1]
-    OCO = max(1, OC // OCI)
+    OCO = OC // OCI
     OHO = OH // OHI
-    OWO = max(1, OW // OWI)
+    OWO = OW // OWI
 
     kvshape = (OCO, KH, KW, IC, OCI)
     ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI)

From 4308191c6d15144ac7f934387e24214d6de0a899 Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Mon, 20 Feb 2023 10:30:25 +0000
Subject: [PATCH 3/5] Reduce the minimum ws size in Arduino test and
 tile_oc->tile_co

Looks like tile_co as a name is deeply ingrained in the codebase, so
revert the tile_oc rename back to tile_co.
---
 python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 8 ++++----
 tests/micro/arduino/test_arduino_workflow.py   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index 5a80fef045c5..eae4f2790372 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -318,13 +318,13 @@ def _tile_size(axis, candidates):
 
         cfg["tile_oh"] = SplitEntity([-1, 1])
         cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])])
-        cfg["tile_oc"] = SplitEntity([-1, _tile_size(OC, [8, 4])])
+        cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])])
         cfg["ann_spatial"] = AnnotateEntity(["none", "vec"])
         cfg["ann_reduce"] = AnnotateEntity(["none", "none"])
         cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
         cfg["compat"] = OtherOptionEntity(0)
 
-    OCI = cfg["tile_oc"].size[-1]
+    OCI = cfg["tile_co"].size[-1]
     OHI = cfg["tile_oh"].size[-1]
     OWI = cfg["tile_ow"].size[-1]
     OCO = OC // OCI
@@ -411,13 +411,13 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
     data_pad = data_vec.op.input_tensors[0]
 
     OWI = cfg["tile_ow"].size[-1]
-    OCI = cfg["tile_oc"].size[-1]
+    OCI = cfg["tile_co"].size[-1]
 
     # schedule unpack/output
     if output != unpack:
         s[unpack].compute_inline()
     n, oh, ow, oc = s[output].op.axis
-    oco, oci = cfg["tile_oc"].apply(s, output, oc)
+    oco, oci = cfg["tile_co"].apply(s, output, oc)
     oho, ohi = cfg["tile_oh"].apply(s, output, oh)
     owo, owi = cfg["tile_ow"].apply(s, output, ow)
     s[output].reorder(n, oho, owo, oco, ohi, owi, oci)
diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py
index 73cdd9b85d28..8c39dc4f16da 100644
--- a/tests/micro/arduino/test_arduino_workflow.py
+++ b/tests/micro/arduino/test_arduino_workflow.py
@@ -95,7 +95,7 @@ def test_model_platform_templating(project_dir, project):
         # TVM causes the amount of memory needed to decrease.
         workspace_size = int(workspace_size_defs[0])
         assert workspace_size < 30000
-        assert workspace_size > 10000
+        assert workspace_size > 9000
 
 
 def test_import_rerouting(project_dir, project):

From 140e0ba3b203ed953112e937a43ff9c8d6e8133a Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Tue, 21 Feb 2023 16:35:30 +0000
Subject: [PATCH 4/5] Add a comment about default config

---
 python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index eae4f2790372..a2429d1b2304 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -316,6 +316,8 @@ def _tile_size(axis, candidates):
                     return candidate
             return 1
 
+        # Tile size 8 results in efficient vectorization for these schedules.
+        # If the axis is not divisible by 8, try 4
         cfg["tile_oh"] = SplitEntity([-1, 1])
         cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])])
         cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])])

From 5a0ec43a3a39ecb7e99ff10e1bebb2c95e99d01f Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Tue, 21 Feb 2023 17:27:07 +0000
Subject: [PATCH 5/5] Lint the comment -_-

---
 python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index a2429d1b2304..78b6e4529223 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -317,7 +317,7 @@ def _tile_size(axis, candidates):
             return 1
 
         # Tile size 8 results in efficient vectorization for these schedules.
-        # If the axis is not divisible by 8, try 4
+        # If the axis is not divisible by 8, try 4
         cfg["tile_oh"] = SplitEntity([-1, 1])
         cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])])
         cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])])