diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 005eae68b8b7..f3d0c1af0a85 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -127,8 +127,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_hwcn.generic", ) elif layout == "NHWC": - channels = data.shape[3] - if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI": + if "SMLAD" in isa and kernel_layout == "HWOI": strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd), diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py index 988c3a99c059..307312076a7e 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py @@ -112,10 +112,14 @@ def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, ou cfg.reduce_axis(in_channels.value), ) - assert in_channels.value % 4 == 0 owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2) cio, cii = cfg.define_split( - "tile_ci", ci, policy="factors", num_outputs=2, filter=lambda x: x.size[-1] % 4 == 0 + "tile_ci", + ci, + policy="factors", + num_outputs=2, + # TODO: check case with in_channels.value % 4 != 0 with AutoTVM + filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0, ) coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2) @@ -134,6 +138,11 @@ def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, ou cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) cfg.define_knob("unroll_explicit", [0, 1]) + if cfg.is_fallback: + cfg.fallback_split("tile_ow", [-1, out_width.value]) + cfg.fallback_split("tile_ci", [-1, in_channels.value]) + cfg.fallback_split("tile_co", [-1, out_channels.value]) + return conv 
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py index fb6f7a589525..9a00fe272087 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py @@ -46,9 +46,8 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): K = K.value if isinstance(N, tvm.tir.IntImm): N = N.value - assert K % 4 == 0 # TODO(weberlo, areusch): support more dtypes? - assert in_dtype == "int8" + assert in_dtype in ("int8", "int16") assert out_dtype == "int32" A = te.placeholder((M, K), name="a", dtype=in_dtype) B = te.placeholder((N, K), name="b", dtype=in_dtype) @@ -71,13 +70,14 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): def intrin_func(ins, outs): aa, bb = ins cc = outs[0] + gemm_func_prefix = "gemm" if in_dtype == "int8" else "gemm16" def _reduce_update(): ib = tvm.tir.ir_builder.create() ib.emit( tvm.tir.call_extern( "int32", - f"gemm_{M}x{K}x{N}_update_{uniq_id}", + f"{gemm_func_prefix}_{M}x{K}x{N}_update_{uniq_id}", aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), @@ -102,7 +102,7 @@ def _body(): ib.emit( tvm.tir.call_extern( "int32", - f"gemm_{M}x{K}x{N}_body_{uniq_id}", + f"{gemm_func_prefix}_{M}x{K}x{N}_body_{uniq_id}", aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), @@ -122,7 +122,7 @@ def _body(): def gemm_MxKxN_impl(M, K, N, uniq_id): """Emit C code for gemm impl.""" # TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly? 
- aa_pad_size = M * K + # aa_pad_size = M * K bb_pad_size = N * K # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) cc_code = f""" @@ -132,32 +132,270 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #include #include +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_body_rest_{uniq_id}( + int K, + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int k_base = (K / 4) * 4; + switch ( K % 4 ) {{ + case 1: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0]; + }} + }} + break; + case 2: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]; + }} + }} + break; + case 3: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1] + + (int32_t) a_ptr[2] * (int32_t) b_ptr[2]; + }} + }} + break; + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}( + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + // NOTE: this is the line where `*_body` differs from `*_update`. 
here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. + cc[i*C_stride + j] = sum; + }} + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif __STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}( int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ - int16_t aa_pad[{aa_pad_size}]; int16_t bb_pad[{bb_pad_size}]; + if ( {M} < 16 || {N} < 16 ) + return gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + + for (int i = 0; i < {N}; i++) + for (int j = 0; j < {K} / 4; j++) + read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); + for (int i = 0; i < {M}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ - read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]); + int16_t aa_pad_line[{K}]; + for (int l = 0; l < {K} / 4; l++) + read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4], (int32_t*) &aa_pad_line[l*4 + 2]); + + for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) aa_pad_line; + int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}]; + int32_t sum = 0; + for (int l = 0; l < 2 * ({K} / 4); l++) {{ + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; + }} + // NOTE: this is the line where `*_body` differs from `*_update`. here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. 
+ cc[i*C_stride + j] = sum; }} }} - for (int i = 0; i < {N}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ + if ( {K} % 4 != 0 ) + gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + + return 0; +}} + + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_update_rest_{uniq_id}( + int K, + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int k_base = (K / 4) * 4; + switch ( K % 4 ) {{ + case 1: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0]; + }} + }} + break; + case 2: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]; + }} + }} + break; + case 3: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1] + + (int32_t) a_ptr[2] * (int32_t) b_ptr[2]; + }} + }} + break; + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}( + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + cc[i*C_stride + j] += sum; + }} + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( + int8_t *aa, int8_t *bb, 
int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int16_t bb_pad[{bb_pad_size}]; + + if ( {M} < 16 || {N} < 16 ) + return gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + + for (int i = 0; i < {N}; i++) + for (int j = 0; j < {K} / 4; j++) read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); + + for (int i = 0; i < {M}; i++) {{ + int16_t aa_pad_line[{K}]; + for (int l = 0; l < {K} / 4; l++) + read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4], (int32_t*) &aa_pad_line[l*4 + 2]); + + for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) aa_pad_line; + int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}]; + int32_t sum = 0; + for (int l = 0; l < 2 * ({K} / 4); l++) {{ + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; + }} + cc[i*C_stride + j] += sum; + }} + }} + + if ( {K} % 4 != 0 ) + gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + + return 0; +}} + + + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_body_rest_{uniq_id}( + int K, + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int k_base = (K / 2) * 2; + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int16_t *a_ptr = &aa[i * A_stride + k_base]; + int16_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0]; + }} + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + // NOTE: this is the line where `*_body` differs from `*_update`. 
here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. + cc[i*C_stride + j] = sum; }} }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + if ( {M} < 2 || {N} < 2 ) + return gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; + int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; + int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ - sum = __SMLAD( - *((int32_t*) &aa_pad[i*{K} + l*2]), - *((int32_t*) &bb_pad[j*{K} + l*2]), - sum); + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; }} // NOTE: this is the line where `*_body` differs from `*_update`. here // we're *setting* the result, instead of accumulating, because we know @@ -166,46 +404,80 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): }} }} + if ( {K} % 2 != 0 ) + gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + return 0; }} + #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( - int8_t *aa, int8_t *bb, int32_t *cc, +__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_update_rest_{uniq_id}( + int K, + int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ - int16_t aa_pad[{aa_pad_size}]; - int16_t bb_pad[{bb_pad_size}]; - + int k_base = (K / 2) * 2; for (int i = 0; i < {M}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ - read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]); + for (int j = 0; j < {N}; j++) {{ + int16_t *a_ptr = &aa[i * A_stride + k_base]; + int16_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) 
b_ptr[0]; }} }} + return 0; +}} - for (int i = 0; i < {N}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ - read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + cc[i*C_stride + j] += sum; }} }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + if ( {M} < 2 || {N} < 2 ) + return gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; + int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; + int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ - sum = __SMLAD( - *((int32_t*) &aa_pad[i*{K} + l*2]), - *((int32_t*) &bb_pad[j*{K} + l*2]), - sum); + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; }} cc[i*C_stride + j] += sum; }} }} + if ( {K} % 2 != 0 ) + gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + return 0; }} + + #ifdef __cplusplus extern "C" #endif diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py index 54c3de252f8a..c27c869509d7 100644 --- a/tests/micro/zephyr/test_utils.py +++ b/tests/micro/zephyr/test_utils.py @@ -14,8 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import io +import os import json import pathlib +import logging +import tarfile + +import numpy as np + +from urllib.request import urlopen, urlretrieve +from urllib.error import HTTPError +import json +import requests + +import tvm.micro TEMPLATE_PROJECT_DIR = ( @@ -60,3 +73,119 @@ def has_fpu(board: str): fpu_boards = [name for name, board in board_properties.items() if board["fpu"]] return board in fpu_boards + + +def build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): + project_dir = temp_dir / "project" + project = tvm.micro.generate_project( + str(TEMPLATE_PROJECT_DIR), + mod, + project_dir, + { + "extra_files_tar": extra_files_tar, + "project_type": "aot_demo", + "west_cmd": west_cmd, + "verbose": bool(build_config.get("debug")), + "zephyr_board": zephyr_board, + }, + ) + project.build() + return project, project_dir + + +def create_header_file(tensor_name, npy_data, output_path, tar_file): + """ + This method generates a header file containing the data contained in the numpy array provided. + It is used to capture the tensor data (for both inputs and expected outputs). 
+ """ + header_file = io.StringIO() + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + + if npy_data.dtype == "int8": + header_file.write(f"int8_t {tensor_name}[] =") + elif npy_data.dtype == "int32": + header_file.write(f"int32_t {tensor_name}[] = ") + elif npy_data.dtype == "uint8": + header_file.write(f"uint8_t {tensor_name}[] = ") + elif npy_data.dtype == "float32": + header_file.write(f"float {tensor_name}[] = ") + else: + raise ValueError("Data type not expected.") + + header_file.write("{") + for i in np.ndindex(npy_data.shape): + header_file.write(f"{npy_data[i]}, ") + header_file.write("};\n\n") + + header_file_bytes = bytes(header_file.getvalue(), "utf-8") + raw_path = pathlib.Path(output_path) / f"{tensor_name}.h" + ti = tarfile.TarInfo(name=str(raw_path)) + ti.size = len(header_file_bytes) + ti.mode = 0o644 + ti.type = tarfile.REGTYPE + tar_file.addfile(ti, io.BytesIO(header_file_bytes)) + + +def _read_line(fd, timeout_sec: int): + data = "" + new_line = False + while True: + if new_line: + break + new_data = fd.read(1, timeout_sec=timeout_sec) + logging.debug(f"read data: {new_data}") + for item in new_data: + new_c = chr(item) + data = data + new_c + if new_c == "\n": + new_line = True + break + return data + + +def get_message(fd, expr: str, timeout_sec: int): + while True: + data = _read_line(fd, timeout_sec) + logging.debug(f"new line: {data}") + if expr in data: + return data + + +# TODO move CMSIS integration to microtvm_api_server.py +# see https://discuss.tvm.apache.org/t/tvm-capturing-dependent-libraries-of-code-generated-tir-initially-for-use-in-model-library-format/11080 +def loadCMSIS(temp_dir): + REPO_PATH = "ARM-software/CMSIS_5" + BRANCH = "master" + API_PATH_URL = f"https://api.github.com/repos/{REPO_PATH}/git/trees" + RAW_PATH_URL = f"https://raw.githubusercontent.com/{REPO_PATH}/{BRANCH}" + + url = 
"https://api.github.com/repos/ARM-software/CMSIS_5/git/trees/master?recursive=1" + r = requests.get(url) + res = r.json() + + include_trees = {} + + for file in res["tree"]: + if file["path"] in {"CMSIS/DSP/Include", "CMSIS/DSP/Include/dsp", "CMSIS/NN/Include"}: + include_trees.update({file["path"]: file["sha"]}) + + for path, sha in include_trees.items(): + url = f"{API_PATH_URL}/{sha}" + content = json.load(urlopen(url)) + temp_path = f"{temp_dir}" + if path == "CMSIS/DSP/Include/dsp": + temp_path = f"{temp_dir}/dsp" + if not os.path.isdir(temp_path): + os.makedirs(temp_path) + for item in content["tree"]: + if item["type"] == "blob": + file_name = item["path"] + file_url = f"{RAW_PATH_URL}/{path}/{file_name}" + print(file_name, " ", file_url) + try: + urlretrieve(file_url, f"{temp_path}/{file_name}") + except HTTPError as e: + print(f"Failed to download {file_url}: {e}") diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index f03b8ecce6d0..a8a7a99a34dd 100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -27,6 +27,7 @@ import numpy as np import tvm +import tvm.testing from tvm.micro.project_api import server import tvm.relay as relay @@ -36,85 +37,6 @@ import test_utils -def _build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): - project_dir = temp_dir / "project" - project = tvm.micro.generate_project( - str(test_utils.TEMPLATE_PROJECT_DIR), - mod, - project_dir, - { - "extra_files_tar": extra_files_tar, - "project_type": "aot_demo", - "west_cmd": west_cmd, - "verbose": bool(build_config.get("debug")), - "zephyr_board": zephyr_board, - }, - ) - project.build() - return project, project_dir - - -def _create_header_file(tensor_name, npy_data, output_path, tar_file): - """ - This method generates a header file containing the data contained in the numpy array provided. 
- It is used to capture the tensor data (for both inputs and expected outputs). - """ - header_file = io.StringIO() - header_file.write("#include \n") - header_file.write("#include \n") - header_file.write("#include \n") - header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") - - if npy_data.dtype == "int8": - header_file.write(f"int8_t {tensor_name}[] =") - elif npy_data.dtype == "int32": - header_file.write(f"int32_t {tensor_name}[] = ") - elif npy_data.dtype == "uint8": - header_file.write(f"uint8_t {tensor_name}[] = ") - elif npy_data.dtype == "float32": - header_file.write(f"float {tensor_name}[] = ") - else: - raise ValueError("Data type not expected.") - - header_file.write("{") - for i in np.ndindex(npy_data.shape): - header_file.write(f"{npy_data[i]}, ") - header_file.write("};\n\n") - - header_file_bytes = bytes(header_file.getvalue(), "utf-8") - raw_path = pathlib.Path(output_path) / f"{tensor_name}.h" - ti = tarfile.TarInfo(name=str(raw_path)) - ti.size = len(header_file_bytes) - ti.mode = 0o644 - ti.type = tarfile.REGTYPE - tar_file.addfile(ti, io.BytesIO(header_file_bytes)) - - -def _read_line(fd, timeout_sec: int): - data = "" - new_line = False - while True: - if new_line: - break - new_data = fd.read(1, timeout_sec=timeout_sec) - logging.debug(f"read data: {new_data}") - for item in new_data: - new_c = chr(item) - data = data + new_c - if new_c == "\n": - new_line = True - break - return data - - -def _get_message(fd, expr: str, timeout_sec: int): - while True: - data = _read_line(fd, timeout_sec) - logging.debug(f"new line: {data}") - if expr in data: - return data - - @tvm.testing.requires_micro def test_tflite(temp_dir, board, west_cmd, tvm_debug): """Testing a TFLite model.""" @@ -175,12 +97,12 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): ) tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) - _create_header_file("input_data", sample, "include", tf) - _create_header_file( + 
test_utils.create_header_file("input_data", sample, "include", tf) + test_utils.create_header_file( "output_data", np.zeros(shape=output_shape, dtype="float32"), "include", tf ) - project, _ = _build_project( + project, _ = test_utils.build_project( temp_dir, board, west_cmd, @@ -192,9 +114,9 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): project.flash() with project.transport() as transport: timeout_read = 60 - _get_message(transport, "#wakeup", timeout_sec=timeout_read) + test_utils.get_message(transport, "#wakeup", timeout_sec=timeout_read) transport.write(b"start\n", timeout_sec=5) - result_line = _get_message(transport, "#result", timeout_sec=timeout_read) + result_line = test_utils.get_message(transport, "#result", timeout_sec=timeout_read) result_line = result_line.strip("\n") result_line = result_line.split(":") @@ -236,10 +158,14 @@ def test_qemu_make_fail(temp_dir, board, west_cmd, tvm_debug): lowered.libmod_name, ["input_1"], ["output"], model_files_path ) tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) - _create_header_file("input_data", np.zeros(shape=shape, dtype=dtype), "include", tf) - _create_header_file("output_data", np.zeros(shape=shape, dtype=dtype), "include", tf) + test_utils.create_header_file( + "input_data", np.zeros(shape=shape, dtype=dtype), "include", tf + ) + test_utils.create_header_file( + "output_data", np.zeros(shape=shape, dtype=dtype), "include", tf + ) - project, project_dir = _build_project( + project, project_dir = test_utils.build_project( temp_dir, board, west_cmd, diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py new file mode 100644 index 000000000000..350f7e242304 --- /dev/null +++ b/tests/micro/zephyr/test_zephyr_armv7m.py @@ -0,0 +1,225 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import logging +import os +import pathlib +import sys +import tarfile +import tempfile + +import pytest +import numpy as np + +import test_utils + +import tvm +import tvm.rpc +import tvm.micro +import tvm.testing +from tvm import relay + +from tvm.contrib.download import download_testdata +from tvm.micro.interface_api import generate_c_interface_header + +import conftest + + +_LOG = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def _open_tflite_model(): + # Import TFLite model + + model_url = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/mnist_model_quant.tflite" + model_path = download_testdata(model_url, "mnist_model_quant.tflite", module="model") + + tflite_model_buf = open(model_path, "rb").read() + + try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + + relay_mod, params = relay.frontend.from_tflite(tflite_model) + + return relay_mod, params + + +def _get_test_data(testdata_dir): + + from PIL import Image + + image_files = ["digit-2.jpg"] + + for file in image_files: + img = Image.open(testdata_dir / file).resize((28, 28)) + img = np.asarray(img).astype("uint8") + sample 
= np.reshape(img, -1) + + output_shape = (1, 10) + + return sample, output_shape + + +def _apply_desired_layout_simd(relay_mod): + + desired_layouts = {"qnn.conv2d": ["NHWC", "HWOI"], "nn.conv2d": ["NHWC", "HWOI"]} + + seq = tvm.transform.Sequential( + [relay.transform.RemoveUnusedFunctions(), relay.transform.ConvertLayout(desired_layouts)] + ) + + with tvm.transform.PassContext(opt_level=3): + return seq(relay_mod) + + +def _apply_desired_layout_no_simd(relay_mod): + + desired_layouts = {"qnn.conv2d": ["NHWC", "HWIO"], "nn.conv2d": ["NHWC", "HWIO"]} + + seq = tvm.transform.Sequential( + [relay.transform.RemoveUnusedFunctions(), relay.transform.ConvertLayout(desired_layouts)] + ) + + with tvm.transform.PassContext(opt_level=3): + return seq(relay_mod) + + +def _generate_project(temp_dir, board, west_cmd, lowered, build_config, sample, output_shape): + + with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tf: + with tempfile.TemporaryDirectory() as tar_temp_dir: + model_files_path = os.path.join(tar_temp_dir, "include") + os.mkdir(model_files_path) + test_utils.loadCMSIS(model_files_path) + tf.add(model_files_path, arcname=os.path.relpath(model_files_path, tar_temp_dir)) + header_path = generate_c_interface_header( + lowered.libmod_name, ["input_1"], ["output"], model_files_path + ) + tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) + + test_utils.create_header_file("input_data", sample, "include", tf) + test_utils.create_header_file( + "output_data", np.zeros(shape=output_shape, dtype="float32"), "include", tf + ) + + project, _ = test_utils.build_project( + temp_dir, + board, + west_cmd, + lowered, + build_config, + extra_files_tar=tar_temp_file.name, + ) + + return project + + +def _run_model(temp_dir, board, west_cmd, lowered, build_config, sample, output_shape): + + project = _generate_project( + temp_dir, board, west_cmd, lowered, build_config, sample, output_shape + ) + + 
project.flash()
+
+    with project.transport() as transport:
+        timeout_read = 60
+        transport.write(b"start\n", timeout_sec=5)
+        result_line = test_utils.get_message(transport, "#result", timeout_sec=timeout_read)
+
+    result_line = result_line.strip("\n")
+    result_line = result_line.split(":")
+    result = int(result_line[1])
+    time = int(result_line[2])
+    logging.info(f"Result: {result}\ttime: {time} ms")
+
+    return result, time
+
+
+@tvm.testing.requires_micro
+def test_armv7m_intrinsic(temp_dir, board, west_cmd, tvm_debug):
+    """Testing an ARM v7m SIMD extension."""
+
+    if board not in [
+        "mps2_an521",
+        "stm32f746xx_disco",
+        "nucleo_f746zg",
+        "nucleo_l4r5zi",
+    ]:
+        pytest.skip(msg="Platform does not support ARM v7m SIMD extension.")
+
+    model = test_utils.ZEPHYR_BOARDS[board]
+
+    build_config = {"debug": tvm_debug}
+
+    this_dir = pathlib.Path(os.path.dirname(__file__))
+    testdata_dir = this_dir.parent / "testdata" / "mnist"
+
+    relay_mod, params = _open_tflite_model()
+
+    sample, output_shape = _get_test_data(testdata_dir)
+
+    relay_mod_simd = _apply_desired_layout_simd(relay_mod)
+    # kernel layout "HWIO" is not supported by arm_cpu SIMD extension (see tvm\python\relay\op\strategy\arm_cpu.py)
+    relay_mod_no_simd = _apply_desired_layout_no_simd(relay_mod)
+
+    target = tvm.target.target.micro(
+        model,
+        options=[
+            "-keys=arm_cpu,cpu",
+            "-link-params=1",
+            "--executor=aot",
+            "--unpacked-api=1",
+            "--interface-api=c",
+        ],
+    )
+
+    temp_dir_simd = temp_dir / "simd"
+    temp_dir_no_simd = temp_dir / "nosimd"
+
+    os.makedirs(temp_dir_simd, exist_ok=True)
+    os.makedirs(temp_dir_no_simd, exist_ok=True)
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        lowered_simd = relay.build(relay_mod_simd, target, params=params)
+        lowered_no_simd = relay.build(relay_mod_no_simd, target, params=params)
+        result_simd, time_simd = _run_model(
+            temp_dir_simd, board, west_cmd, lowered_simd, build_config, sample, output_shape
+        )
+        result_no_simd, time_no_simd = _run_model(
+            temp_dir_no_simd, board, west_cmd, lowered_no_simd, build_config, sample, output_shape
+        )
+
+    assert result_no_simd == result_simd == 2
+
+    # Time performance measurements on QEMU emulator are always equal to zero.
+    if board not in [
+        "mps2_an521",
+    ]:
+        assert time_no_simd > time_simd
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))