diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index 4eed56a22572..78b6e4529223 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -20,6 +20,7 @@
 import tvm
 from tvm import te
 from tvm import autotvm
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity
 from .. import nn
 from ..utils import get_const_tuple
 from ..nn.utils import get_const_int, get_pad_tuple
@@ -302,9 +303,29 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_
     )
 
     cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
-    cfg.define_annotate("ann_spatial", [ohi, owi, oci], policy="try_unroll_vec")
+    cfg.define_annotate("ann_spatial", [owi, oci], policy="try_unroll_vec")
     # ====================================================================
 
+    # If there are no tuning records, use this config
+    if cfg.is_fallback:
+
+        def _tile_size(axis, candidates):
+            """Choose the inner split factor used for an axis in the fallback config.
+
+            Returns the first candidate that evenly divides ``axis``, or 1 if none does.
+            """
+            return next((factor for factor in candidates if axis % factor == 0), 1)
+
+        # Tile size 8 results in efficient vectorization for these schedules.
+        # If the axis is not divisible by 8, try 4; if neither divides it, use 1 (no tiling)
+        cfg["tile_oh"] = SplitEntity([-1, 1])
+        cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])])
+        cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])])
+        cfg["ann_spatial"] = AnnotateEntity(["none", "vec"])
+        cfg["ann_reduce"] = AnnotateEntity(["none", "none"])
+        cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        cfg["compat"] = OtherOptionEntity(0)
+
     OCI = cfg["tile_co"].size[-1]
     OHI = cfg["tile_oh"].size[-1]
     OWI = cfg["tile_ow"].size[-1]
@@ -390,7 +411,7 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
     data_vec = conv.op.input_tensors[0]
     kernel_vec = conv.op.input_tensors[1]
     data_pad = data_vec.op.input_tensors[0]
-    OHI = cfg["tile_oh"].size[-1]
+
     OWI = cfg["tile_ow"].size[-1]
     OCI = cfg["tile_co"].size[-1]
 
@@ -402,20 +423,18 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
     oho, ohi = cfg["tile_oh"].apply(s, output, oh)
     owo, owi = cfg["tile_ow"].apply(s, output, ow)
     s[output].reorder(n, oho, owo, oco, ohi, owi, oci)
-    cfg["ann_spatial"].apply(
-        s, output, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg
-    )
-    cfg.define_knob("compat", [0, 1, 2])
-    if cfg["compat"].val < 2:
-        compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
-        s[conv].compute_at(s[output], compat_axis)
+    cfg["ann_spatial"].apply(s, output, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg)
+
+    cfg.define_knob("compat", [0, 1])
+    compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
+    s[conv].compute_at(s[output], compat_axis)
     paxis = s[output].fuse(n, oho)
     s[output].parallel(paxis)
 
     # schedule conv
     n, oho, owo, oco, ohi, owi, oci = s[conv].op.axis
     ic, kh, kw = s[conv].op.reduce_axis
-    cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ohi, owi, ic, oci])
+    cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci])
     cfg["ann_reduce"].apply(
         s,
         conv,
@@ -424,33 +443,20 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output):
         max_unroll=16,
         cfg=cfg,
     )
-    cfg["ann_spatial"].apply(
-        s, conv, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg
-    )
-    if cfg["compat"].val < 2:
-        compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
-        s[kernel_vec].compute_at(s[conv], compat_axis)
-        s[data_vec].compute_at(s[conv], compat_axis)
-
-    if not autotvm.GLOBAL_SCOPE.in_tuning:
-        # schedule kernel pack
-        oco, kh, kw, ic, oci = kernel_vec.op.axis
-        s[kernel_vec].vectorize(oci)
-        s[kernel_vec].unroll(ic)
-        if cfg["compat"].val == 2:
-            s[kernel_vec].parallel(oco)
-
-    # schedule data pack
+    cfg["ann_spatial"].apply(s, conv, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg)
+
+    # schedule data_vec, data_pad and kernel_vec
+    compat_axis = [owo, oco][cfg["compat"].val]  # pylint: disable=R1706
+    s[kernel_vec].compute_at(s[conv], compat_axis)
+    s[data_vec].compute_at(s[conv], compat_axis)
+
+    # Inlining kernel vec brings a performance improvement, but the tuner seems to not
+    # like it, so inline only when we are using the fallback config
+    if cfg.is_fallback:
+        s[kernel_vec].compute_inline()
+
-    if data_vec.op.name == "data_vec_undilated":
-        n, oho, owo, kh, kw, ic, ohi, owi = s[data_vec].op.axis
-        s[data_vec].vectorize(owi)
-        s[data_vec].unroll(ohi)
-    else:
-        n, oho, owo, ohi, owi, ic = s[data_vec].op.axis
-        s[data_vec].vectorize(ic)
-        s[data_vec].unroll(owi)
-    if cfg["compat"].val == 2:
-        paxis = s[data_vec].fuse(n, oho)
-        s[data_vec].parallel(paxis)
+    # Attach data_pad at the first axis of data_vec. That axis is the same (n) for both the
+    # "data_vec_undilated" layout and the default one, so no per-layout unpacking is needed
+    s[data_pad].compute_at(s[data_vec], s[data_vec].op.axis[0])
 
     return s
diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py
index 73cdd9b85d28..8c39dc4f16da 100644
--- a/tests/micro/arduino/test_arduino_workflow.py
+++ b/tests/micro/arduino/test_arduino_workflow.py
@@ -95,7 +95,7 @@ def test_model_platform_templating(project_dir, project):
         # TVM causes the amount of memory needed to decrease.
         workspace_size = int(workspace_size_defs[0])
         assert workspace_size < 30000
-        assert workspace_size > 10000
+        assert workspace_size > 9000
 
 
 def test_import_rerouting(project_dir, project):