From cf8536e08f81c3d9360465ade702defc6c17a583 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Fri, 25 Nov 2022 07:23:51 -0800 Subject: [PATCH 01/19] Add support for altered out layouts --- .../mprofile/dsp/micro_kernel/tensordot.py | 9 ++- python/tvm/topi/arm_cpu/qnn.py | 72 ++++++++++--------- python/tvm/topi/arm_cpu/qnn_alter_op.py | 18 ++++- .../arm_cpu/test_quantized_convolution.py | 10 ++- .../python/test_topi_conv2d_tensordot_opts.py | 6 +- 5 files changed, 72 insertions(+), 43 deletions(-) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py index 1d36e1dd1e9c..b89ddf7dea1f 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py @@ -286,7 +286,14 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]: num_packed = (num_outputs - offset) // 2 for i in range(num_packed): index = 2 * i + offset - yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index + 1} << 16);" + # We must explicitly call asm inline to use the PKHBT instruction. It is not part of + # ACLE and has no __builtin. Writing it using masks and bitshifts does not work either: + # Arm GCC 12 with -O3 does not compile these efficiently. + yield f"int packed_res_{i};" + yield ( + f'asm inline ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_{i}) : ' + f'"r" (requant_{index}), "r" (requant_{index + 1}));' + ) if offset == 1: yield "((int16_t*) output)[1] = (int16_t) requant_0;" diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index fad64cc09bb8..19ab31c51760 100644 --- a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -20,7 +20,7 @@ Arm Cortex-M with DSP. 
""" -from typing import Tuple +from typing import Callable, Tuple import tvm from tvm import te @@ -123,27 +123,44 @@ def _make_tscript_call(func_name, *args): def _make_conv2d_primfunc( - call_dimensions: Tuple, + output_dimensions: Tuple[int, int, int, int], buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple], aligned_func: Tuple[str, str], offset_func: Tuple[str, str], - ptr_gens: Tuple, + ptr_gens: Tuple[Callable, Callable], + output_layout="NHWC", ): - height, width, out_channels = call_dimensions + out_height, out_width, out_channels, num_sums = output_dimensions data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes aligned_func_name, aligned_func_code = aligned_func offset_func_name, offset_func_code = offset_func - output_ptr, data_ptr, kernel_ptr = ptr_gens + data_ptr, kernel_ptr = ptr_gens # If the functions are identical, we can skip the second loop if aligned_func_name == offset_func_name: aligned_channels = out_channels - offset_channels = tvm.tir.const(0) - c_step = tvm.tir.const(1) + offset_channels = 0 + c_step = const(1) else: aligned_channels = out_channels // 2 offset_channels = out_channels // 2 - c_step = tvm.tir.const(2) + c_step = const(2) + + def output_ptr(output, y, x, c): + if output_layout == "NHWC": + return _make_tscript_ptr( + output, + y * const(out_width * out_channels) + x * const(out_channels * num_sums) + c, + 1, + ) + elif output_layout == "NCHW": + return _make_tscript_ptr( + output, + c * const(out_height * out_width) + y * const(out_width) + x * const(num_sums), + 1, + ) + else: + raise TVMError(f"Unsupported out_layout '{output_layout}'!") def bias_ptr(bias, c): return _make_tscript_ptr(bias, c, 1, dtype="int32") @@ -181,7 +198,9 @@ def biased_quantized_conv2d( __4 = scale[0] # pylint: enable=unused-variable - for c_ax, y_ax, x_ax in T.grid(aligned_channels, height, width): + for c_ax, y_ax, x_ax in T.grid( + const(aligned_channels), const(out_height), const(out_width // num_sums) + ): with T.block("conv2d_aligned"): T.block_attr({"pragma_import_c": aligned_func_code}) y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) @@ -194,7 +213,9 @@ def biased_quantized_conv2d( scale_ptr(scale, c * c_step), ) - for c_ax, y_ax, x_ax in T.grid(offset_channels, height, width): + for c_ax, y_ax, x_ax in T.grid( + const(offset_channels), const(out_height), const(out_width // num_sums) + ): with T.block("conv2d_offset"): T.block_attr({"pragma_import_c": offset_func_code}) y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) @@ -222,8 +243,6 @@ def qnn_conv2d(attrs, inputs, out_type): # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper. assert len(inputs) == 11 data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - output_layout = attrs.out_layout - assert output_layout == "NHWC" _, height, width, in_channels = get_const_tuple(data.shape) out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) @@ -253,14 +272,6 @@ def qnn_conv2d(attrs, inputs, out_type): aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, False) - # Helper functions to make pointers - def output_ptr(buffer, y, x, c): - return _make_tscript_ptr( - buffer, - y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, - 1, - ) - # We need to disable pylint's unused argument checker, as the kwarg offset is unused but must # be present for compatibility. We cannot add an underscore as we normally would, as this makes # the keyword not match. 
@@ -284,11 +295,12 @@ def kernel_ptr(buffer, c, offset=0): ) prim_func = _make_conv2d_primfunc( - (const(out_height), const(out_width // num_outputs), const(out_channels)), + (out_height, out_width, out_channels, num_sums), (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), aligned_func, offset_func, - (output_ptr, data_ptr, kernel_ptr), + (data_ptr, kernel_ptr), + output_layout=attrs.out_layout, ) output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") @@ -308,12 +320,9 @@ def qnn_depthwise_conv2d(attrs, inputs, out_type): assert len(inputs) == 11 data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - output_layout = attrs.out_layout - assert output_layout == "NHWC" _, _, height, width = get_const_tuple(data.shape) _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) - _, out_height, out_width, _ = get_const_tuple(out_type.shape) y_stride, x_stride = get_const_tuple(attrs.strides) out_height = _compute_output_dim(height, kernel_h, y_stride) @@ -323,14 +332,6 @@ def qnn_depthwise_conv2d(attrs, inputs, out_type): aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, True) - # Helper functions for making pointers. - def output_ptr(buffer, y, x, c): - return _make_tscript_ptr( - buffer, - y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, - 1, - ) - def data_ptr(buffer, y, x, c, offset=0): if height * width % 2 == 1: x_ptr_offset = tvm.tir.const(-1) @@ -354,11 +355,12 @@ def kernel_ptr(buffer, c, offset=0): ) prim_func = _make_conv2d_primfunc( - (const(out_height), const(out_width // num_outputs), const(out_channels)), + (out_height, out_width, out_channels, num_sums), (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), aligned_func, offset_func, - (output_ptr, data_ptr, kernel_ptr), + (data_ptr, kernel_ptr), + output_layout=attrs.out_layout, ) output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index 00225493db96..a0e9641136ee 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -79,7 +79,7 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): optimization means we must pad tensors with the input zero point, and NOT with zero. """ - prev_op, biases = inputs + prev_op, biases_data_op = inputs if not hasattr(prev_op, "op"): return None if prev_op.op.name != "qnn.conv2d": @@ -108,6 +108,13 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): # The zero point is subtracted from the input elements, so we need a "-" sign here zp_shifted_sums = element_sums * (-conv_input_zp) + # The bias values may or may not be wrapped in an expand_dims op + if isinstance(biases_data_op, relay.expr.Call): + biases = biases_data_op.args[0] + else: + biases = biases_data_op + assert isinstance(biases, relay.expr.Constant) + # We want to make sure new_biases is representable as an int32. It's tempting to just check # whether arr.dtype == "int32" (since Numpy will automatically increase dtype in some cases) # but this leads to weird wrapping behavior and doesn't work. We must do it manually. 
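
# NOTE (illustrative sketch, not part of this patch): the hunk above ends before the range
# check that the comment describes. One way such a manual check could look, with purely
# illustrative names rather than the patch's actual code:
#
#     new_biases = biases.data.numpy().astype("int64") + zp_shifted_sums
#     int32_info = np.iinfo("int32")
#     assert int32_info.min <= new_biases.min() and new_biases.max() <= int32_info.max
#
# Accumulating in int64 and comparing against explicit int32 bounds avoids depending on
# NumPy's implicit dtype promotion, which the comment notes can silently wrap.
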
@@ -119,4 +126,11 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): new_conv_args = (*prev_op.args[:2], new_input_zp, *prev_op.args[3:]) new_conv_op = relay.qnn.op.conv2d(*new_conv_args, **prev_op.attrs) bias_constant = relay.Constant(nd.array(new_biases.astype("int32"))) - return relay.add(new_conv_op, bias_constant) + + # If biases was wrapped in an expand_dims op, we must re-wrap it + if isinstance(biases_data_op, relay.expr.Call): + new_biases_op = relay.expand_dims(bias_constant, **biases_data_op.attrs) + else: + new_biases_op = bias_constant + + return relay.add(new_conv_op, new_biases_op) diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index 573231f9632c..43a33352388b 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -284,9 +284,10 @@ def _make_executor(): ) +@pytest.mark.parametrize("output_layout", ["NHWC", "NCHW"]) @pytest.mark.parametrize("layer", range(23)) @tvm.testing.requires_corstone300 -def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer): +def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer, output_layout): """Checks microTVM output against TFLite for one MobileNetV1 layer. Loads the input, kernel, bias, expected output, and quantization parameters from the specified @@ -309,6 +310,9 @@ def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer): layer: int The index of the layer to check against TensorFlow's ground truth values. + + output_layout: str + The output_layout for microTVM to use. Does not have to match the TensorFlow layout. """ dtype = "int16" @@ -316,9 +320,9 @@ def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer): padding, strides, is_depthwise = _get_mobilenet_v1_layer_attributes(layer) if is_depthwise: - data_layout, kernel_layout, output_layout = "NCHW", "OIHW", "NHWC" + data_layout, kernel_layout = "NCHW", "OIHW" else: - data_layout, kernel_layout, output_layout = "NHWC", "OHWI", "NHWC" + data_layout, kernel_layout = "NHWC", "OHWI" test_model = _make_aot_model( (tensor, kernel, bias, output), diff --git a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py index 46d2797ba394..fc0b556e9d10 100644 --- a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py +++ b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py @@ -308,8 +308,10 @@ def test_1x1x8_convolution_code(): requant_3 = (requant_3 + 1) >> 1; requant_3 = __ssat(requant_3 + -128, 8); - int32_t packed_res_0 = requant_0 + (requant_1 << 16); - int32_t packed_res_1 = requant_2 + (requant_3 << 16); + int packed_res_0; + asm inline ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_0) : "r" (requant_0), "r" (requant_1)); + int packed_res_1; + asm inline ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_1) : "r" (requant_2), "r" (requant_3)); output[0] = packed_res_0; output[1] = packed_res_1; return 0; From 476426d23d837af22212672ed07a9019e309e365 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Fri, 25 Nov 2022 09:16:25 -0800 Subject: [PATCH 02/19] Support more VWW layers Fix rebase conflict --- python/tvm/topi/arm_cpu/qnn.py | 36 ++++++++++++------- .../arm_cpu/test_quantized_convolution.py | 10 +++++- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index 19ab31c51760..5d5f0b2cf77c 100644 --- 
a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -23,7 +23,7 @@ from typing import Callable, Tuple import tvm -from tvm import te +from tvm import te, TVMError from tvm.tir import const from tvm.script import tir as T from ..utils import get_const_tuple @@ -38,6 +38,19 @@ def _compute_output_dim(data_length, kernel_length, stride): return int_ceil_division(data_length + 1 - kernel_length, stride) +def _pick_num_outputs(out_width): + """Guess a good value for num_outputs.""" + + assert out_width > 1 + + # num_outputs is capped at 8 + for i in range(2, min(out_width + 1, 8)): + if out_width % i == 0: + return i + + raise TVMError(f"Cannot pick a good num_outputs value for out_width = {out_width}!") + + def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False): """Helper function that chooses the right implementation of micro_kernel.tensordot. @@ -130,7 +143,7 @@ def _make_conv2d_primfunc( ptr_gens: Tuple[Callable, Callable], output_layout="NHWC", ): - out_height, out_width, out_channels, num_sums = output_dimensions + out_height, out_width, out_channels, num_outputs = output_dimensions data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes aligned_func_name, aligned_func_code = aligned_func offset_func_name, offset_func_code = offset_func @@ -150,13 +163,13 @@ def output_ptr(output, y, x, c): if output_layout == "NHWC": return _make_tscript_ptr( output, - y * const(out_width * out_channels) + x * const(out_channels * num_sums) + c, + y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, 1, ) elif output_layout == "NCHW": return _make_tscript_ptr( output, - c * const(out_height * out_width) + y * const(out_width) + x * const(num_sums), + c * const(out_height * out_width) + y * const(out_width) + x * const(num_outputs), 1, ) else: @@ -199,7 +212,7 @@ def biased_quantized_conv2d( # pylint: enable=unused-variable for c_ax, y_ax, x_ax in T.grid( - const(aligned_channels), const(out_height), const(out_width // num_sums) + const(aligned_channels), const(out_height), const(out_width // num_outputs) ): with T.block("conv2d_aligned"): T.block_attr({"pragma_import_c": aligned_func_code}) @@ -214,7 +227,7 @@ def biased_quantized_conv2d( ) for c_ax, y_ax, x_ax in T.grid( - const(offset_channels), const(out_height), const(out_width // num_sums) + const(offset_channels), const(out_height), const(out_width // num_outputs) ): with T.block("conv2d_offset"): T.block_attr({"pragma_import_c": offset_func_code}) @@ -254,9 +267,8 @@ def qnn_conv2d(attrs, inputs, out_type): # Decide how many sums our function should have running at the same time. Doing # this lets us do "more work" for each memory load, but doing too many of them causes us to run # out of registers. Currently this is set to either 1 or 2, but autotuning this value would - # improve performance a lot. Tracked by https://github.com/apache/tvm/issues/13528. - - num_outputs = 2 + # improve performance a lot. + num_outputs = _pick_num_outputs(out_width) # Next, decide whether whether we need "parity alternation". 
For example, if we have an # 8x3x3x3 kernel (8 output channels, height 3, width 3, input channels 3) in the OHWI layout, @@ -295,7 +307,7 @@ def kernel_ptr(buffer, c, offset=0): ) prim_func = _make_conv2d_primfunc( - (out_height, out_width, out_channels, num_sums), + (out_height, out_width, out_channels, num_outputs), (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), aligned_func, offset_func, @@ -328,7 +340,7 @@ def qnn_depthwise_conv2d(attrs, inputs, out_type): out_height = _compute_output_dim(height, kernel_h, y_stride) out_width = _compute_output_dim(width, kernel_w, x_stride) - num_outputs = 2 + num_outputs = _pick_num_outputs(out_width) aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, True) @@ -355,7 +367,7 @@ def kernel_ptr(buffer, c, offset=0): ) prim_func = _make_conv2d_primfunc( - (out_height, out_width, out_channels, num_sums), + (out_height, out_width, out_channels, num_outputs), (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), aligned_func, offset_func, diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index 43a33352388b..6e7fc38fa349 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -284,8 +284,16 @@ def _make_executor(): ) +def _make_parameterization(layer_num): + if layer_num == 25: + mark = pytest.mark.xfail(reason="unaligned input data ptr", run=False) + return pytest.param(layer_num, marks=mark) + else: + return layer_num + + @pytest.mark.parametrize("output_layout", ["NHWC", "NCHW"]) -@pytest.mark.parametrize("layer", range(23)) +@pytest.mark.parametrize("layer", (_make_parameterization(n) for n in range(27))) @tvm.testing.requires_corstone300 def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer, output_layout): """Checks microTVM output against TFLite for one MobileNetV1 layer. From e2ed946c753f683a1ce09161e058056961ecbd23 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 30 Nov 2022 08:26:25 -0800 Subject: [PATCH 03/19] Begin work to modify Relay graph Hackish work to adjust relay Fix quantized test Code showing more promise Fix bugs Hack to remove unnecessary cast Infinite bias Fancy alter op attempt Attempt 2 Better layout alternation WIP dump of new legalization work More tech for Relay transform Functioning alter op! 
convolution -> average pool -> dense -> bias alteration Move legalization logic to permanent home Fix bugs from code move --- python/tvm/relay/frontend/tflite.py | 3 +- python/tvm/relay/op/nn/_nn.py | 22 ++ python/tvm/relay/qnn/strategy/arm_cpu.py | 46 ++- python/tvm/topi/arm_cpu/__init__.py | 3 +- .../mprofile/dsp/micro_kernel/tensordot.py | 2 +- python/tvm/topi/arm_cpu/qnn.py | 37 +- python/tvm/topi/arm_cpu/qnn_alter_op.py | 174 ++++++--- python/tvm/topi/arm_cpu/qnn_legalize.py | 340 ++++++++++++++++++ python/tvm/topi/nn/qnn.py | 33 +- src/relay/backend/utils.cc | 3 +- src/target/source/codegen_c.cc | 4 +- .../arm_cpu/test_quantized_convolution.py | 59 ++- 12 files changed, 667 insertions(+), 59 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/qnn_legalize.py diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index db21fa6668d1..19f6887e17fd 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -558,7 +558,8 @@ def convert_qnn_fused_activation_function( if fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1: return _op.clip(expr, a_min=max(qmin, quantize(-1.0)), a_max=min(qmax, quantize(1.0))) if fused_activation_fn == ActivationFunctionType.RELU: - return _op.clip(expr, a_min=max(qmin, quantize(0.0)), a_max=qmax) + return expr + #return _op.clip(expr, a_min=max(qmin, quantize(0.0)), a_max=qmax) fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] raise tvm.error.OpNotImplemented( diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index e956c82828c1..404ad826cc6d 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -878,6 +878,28 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts): # QNN ops +@reg.register_legalize("nn.bias_add") +def legalize_bias_add(attrs, inputs, tinfos): + """Legalize a bias add operator. + + May be used to "fold in" unused channels from quantized convolution operators. This should + be done before layout rewrites occur to minimize the amount of "extra" overhead operators + like "cast" and "layout_transform". + """ + return topi.nn.qnn.qnn_bias_add_legalize(attrs, inputs, tinfos) + + +@reg.register_alter_op_layout("qnn.conv2d") +def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type): + """Alter the layout of a qnn conv2d op. + + May be used to alter the current QNN Conv2D op, but can also be used to alter previous ops to + better match the current op. For example, Arm Cortex-M uses this to set the out_layout of + previous ops to the input layout preferred by future layouts. + """ + return topi.nn.qnn.qnn_conv2d_alter_layout(attrs, inputs, tinfos, out_type) + + @reg.register_alter_op_layout("add") def alter_op_layout_add(attrs, inputs, tinfos, out_type): """Alter the layout of a add op. diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py index f8653817835e..f9e7c7abac68 100644 --- a/python/tvm/relay/qnn/strategy/arm_cpu.py +++ b/python/tvm/relay/qnn/strategy/arm_cpu.py @@ -21,11 +21,47 @@ regular/depthwise conv2d is supported, but qnn_dense will be added eventually.""" from tvm import topi, TVMError -from .generic import qnn_conv2d_strategy +from .generic import * from ... 
import op as _op from ...op.strategy.generic import is_depthwise_conv2d +@qnn_quantize_strategy.register("arm_cpu") +def qnn_quantize_strategy_arm_cpu(attrs, inputs, out_type, target): + """qnn.quantize strategy for arm_cpu""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_quantize(topi.hexagon.qnn_quantize), + wrap_topi_schedule(topi.hexagon.schedule_qnn_quantize), + name="qnn_quantize.arm_cpu", + ) + return strategy + + +@qnn_dequantize_strategy.register("arm_cpu") +def qnn_dequantize_strategy_arm_cpu(attrs, inputs, out_type, target): + """qnn.dequantize strategy for arm_cpu""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_dequantize(topi.hexagon.qnn_dequantize), + wrap_topi_schedule(topi.hexagon.schedule_qnn_dequantize), + name="qnn_dequantize.arm_cpu", + ) + return strategy + + +@qnn_dense_strategy.register("arm_cpu") +def qnn_dense_strategy_arm_cpu(attrs, inputs, out_type, target): + """qnn.dense strategy for arm_cpu""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_topi_qnn_dense(topi.hexagon.qnn_dense), + wrap_topi_schedule(topi.hexagon.schedule_qnn_dense), + name="qnn_dense.arm_cpu", + ) + return strategy + + @qnn_conv2d_strategy.register("arm_cpu") def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP. @@ -53,19 +89,23 @@ def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): strategy = _op.OpStrategy() if groups == 1: - if data_layout == "NHWC" and kernel_layout == "OHWI": + if True or (data_layout == "NHWC" and kernel_layout == "OHWI"): strategy.add_implementation( topi.arm_cpu.qnn_conv2d, topi.arm_cpu.schedule_qnn_conv2d, name="qnn_conv2d.arm_cpu", ) + else: + raise TVMError("QNN regular Conv2D for Arm Cortex-M DSP got incorrect input layout!") elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups): - if data_layout == "NCHW" and kernel_layout == "IOHW": + if True or (data_layout == "NCHW" and kernel_layout == "IOHW"): strategy.add_implementation( topi.arm_cpu.qnn_depthwise_conv2d, topi.arm_cpu.schedule_qnn_depthwise_conv2d, name="qnn_depthwise_conv2d.arm_cpu", ) + else: + raise TVMError("QNN depthwise Conv2D for Arm Cortex-M DSP got incorrect input layout!") else: raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d") diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index eba102662bc4..054103f43bef 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -23,7 +23,6 @@ from .conv2d_transpose import * from .conv2d_int8 import * from . import conv2d_alter_op -from . import qnn_alter_op from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * @@ -31,3 +30,5 @@ from .pooling import * from .dense import * from .qnn import * +from . import qnn_alter_op +from . import qnn_legalize diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py index b89ddf7dea1f..d2a8f1ef6905 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py @@ -291,7 +291,7 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]: # Arm GCC 12 with -O3 does not compile these efficiently. 
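
# NOTE (illustrative sketch, not part of this patch): PKHBT ("pack halfword bottom/top")
# writes the low halfword of its second operand into bits [15:0] of the result and, with
# the LSL #16 variant used here, the low halfword of the third operand into bits [31:16].
# In plain integer terms the packed word is:
#
#     packed = (requant_lo & 0xFFFF) | ((requant_hi & 0xFFFF) << 16)
#
# which is the masked/shifted form the comment above says Arm GCC 12 does not lower
# efficiently. The change just below only swaps the `asm inline` spelling for `__asm__`;
# the instruction emitted stays the same.
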
yield f"int packed_res_{i};" yield ( - f'asm inline ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_{i}) : ' + f'__asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_{i}) : ' f'"r" (requant_{index}), "r" (requant_{index + 1}));' ) diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index 5d5f0b2cf77c..7fa80b99a939 100644 --- a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -28,6 +28,7 @@ from tvm.script import tir as T from ..utils import get_const_tuple from .mprofile.dsp.micro_kernel import tensordot +from tvm.topi.nn.pad import pad def int_ceil_division(x, y): @@ -255,12 +256,30 @@ def qnn_conv2d(attrs, inputs, out_type): # Make a few checks to unpack the function arguments and ensure it was called with the right # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper. assert len(inputs) == 11 + assert attrs + import pdb + data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - _, height, width, in_channels = get_const_tuple(data.shape) out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) y_stride, x_stride = get_const_tuple(attrs.strides) + # rq_input_zero_point_const = inputs[8].op.body[0] + rq_input_zero_point_const = -128 + + padding = get_const_tuple(attrs.padding) + if any(padding): + pad_up, pad_left, pad_down, pad_right = padding + padded_data = pad( + data, + [0, pad_up, pad_left, 0], + [0, pad_down, pad_right, 0], + rq_input_zero_point_const, + ) + else: + padded_data = data + + _, height, width, in_channels = get_const_tuple(padded_data.shape) out_height = _compute_output_dim(height, kernel_h, y_stride) out_width = _compute_output_dim(width, kernel_w, x_stride) @@ -333,10 +352,24 @@ def qnn_depthwise_conv2d(attrs, inputs, out_type): assert len(inputs) == 11 data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - _, _, height, width = get_const_tuple(data.shape) _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) y_stride, x_stride = get_const_tuple(attrs.strides) + # rq_input_zero_point_const = inputs[8].op.body[0] + rq_input_zero_point_const = -128 + padding = get_const_tuple(attrs.padding) + if any(padding): + pad_up, pad_left, pad_down, pad_right = padding + padded_data = pad( + data, + [0, 0, pad_up, pad_left], + [0, 0, pad_down, pad_right], + rq_input_zero_point_const, + ) + else: + padded_data = data + + _, _, height, width = get_const_tuple(padded_data.shape) out_height = _compute_output_dim(height, kernel_h, y_stride) out_width = _compute_output_dim(width, kernel_w, x_stride) diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index a0e9641136ee..fc419153173a 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -19,57 +19,107 @@ import numpy as np from tvm import nd, relay, target -from ..nn import qnn_requantize_alter_layout, qnn_add_alter_layout +from ..nn import * -@qnn_requantize_alter_layout.register(["arm_cpu"]) -def alter_requantize_layout(attrs, inputs, _tinfos, _out_type): - """Changes a floating point requantize op to use int64 multiply + shift for microTVM. +def prev_ops_match(curr_op, pattern): + prev_op = curr_op + for op_name in pattern: + if (not hasattr(prev_op, "op")) or prev_op.op.name != op_name: + return False + prev_op = prev_op.args[0] + return True - Usually, this is done by QNN legalization. 
However, microTVM wants to manually choose the - integer rounding constants in order to: - (a) Have int32, not int64 constants - (b) Use a constant rounding shift to skip a memory load. - Ideally, we would pick these constants in the requantize (or fused) schedule. Unfortunately that - is not currently possible, so we pick them with `alter_layout` as a hack. This will only work if - the requantize schedule "plays along" with this hack. - """ +def edit_attrs(attrs, **kwargs): + return {**attrs, **kwargs} - # Only microTVM Cortex-M boards with DSP use the relevant schedules - current_target = target.Target.current(allow_none=False) - if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu): - return None - _, in_scale, _, out_scale, _ = inputs - in_scale_numpy = in_scale.data.numpy().astype("float64") - out_scale_scalar = out_scale.data.numpy().item() +def change_numpy_layout(arr, src_layout, dst_layout): + assert src_layout.isalpha() and dst_layout.isalpha() + axis_order = [src_layout.index(c) for c in dst_layout] + return np.transpose(arr, axis_order) - # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting by 1 again. This is - # useful, because shifting a multiplication product by 32 can be done for "free" with SMMUL - scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32") - # Requantize ops in Relay do not support int32 scales - if we try to use one, requantize.cc will - # raise an error. As a hacky work-around, we change the scale dtype to float32, without changing - # underlying data. This works, as our compute function knows to interpret the scale as an int32. +def _squash_transformations(expr): + if isinstance(expr, relay.expr.Constant): + return expr.data.numpy() + assert isinstance(expr, relay.expr.Call) + assert len(expr.args) == 1 - # This is only a work-around - a better long-term solution would be adding a new integer - # requantize op, which takes integer scales, shifts, and rounding behavior. 
- fake_float_scales = scales.view("float32") + prev_kernel = _squash_transformations(expr.args[0]) + attrs = expr.attrs - scale_constant = relay.Constant(nd.array(fake_float_scales)) - return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], **attrs) + if expr.op.name == "layout_transform": + return change_numpy_layout(prev_kernel, attrs.src_layout, attrs.dst_layout) + elif expr.op.name == "cast": + return prev_kernel.astype(attrs.dtype) + elif kernel.op.name == "expand_dims": + new_axes = range(attrs.axis, attrs.axis + attrs.num_newaxis) + return np.expand_dims(prev_kernel, tuple(axes)) + else: + raise RuntimeError(f"Invalid kernel transformation '{expr}'!") + + +def _alter_depthwise_conv2d_layout(depthwise_conv2d): + cast_op = depthwise_conv2d.args[0] + requantize_op = cast_op.args[0] + add_op = requantize_op.args[0] + prev_conv2d_op = add_op.args[0] + + return relay.qnn.op.conv2d( + relay.layout_transform( + relay.cast( + relay.qnn.op.requantize( + relay.op.add( + relay.qnn.op.conv2d( + *prev_conv2d_op.args, + **edit_attrs(prev_conv2d_op.attrs, out_layout="NCHW"), + ), + relay.layout_transform( + add_op.args[1], + src_layout="NHWC", + dst_layout="NCHW", + ), + ), + *requantize_op.args[1:], + **edit_attrs(requantize_op.attrs, axis=1), + ), + dtype="int16", + ), + src_layout="NCHW", + dst_layout="NHWC", + ), + *depthwise_conv2d.args[1:], + **edit_attrs(depthwise_conv2d.attrs, data_layout="NCHW"), + ) -def _is_qnn_op_depthwise_conv2d(qnn_conv2d_op): - return relay.op.strategy.generic.is_depthwise_conv2d( - qnn_conv2d_op.args[0].type_annotation.shape, - qnn_conv2d_op.attrs.data_layout, - qnn_conv2d_op.args[1].data.shape, - qnn_conv2d_op.attrs.kernel_layout, - qnn_conv2d_op.attrs.groups, +@qnn_conv2d_alter_layout.register(["arm_cpu"]) +def alter_conv2d_layout(attrs, inputs, _tinfos, _out_type): + """Adjust a qnn.conv2d and preceeding ops to better fit on Cortex-M.""" + current_target = target.Target.current(allow_none=False) + if not "cortex-m" in current_target.mcpu: + return None + + # Always cast to int16 and pick a our desired kernel layout - this won't affect anything + data_expr, kernel_expr = inputs[:2] + is_depthwise = attrs.groups > 1 + new_kernel_layout = "IOHW" if is_depthwise else "OHWI" + + op = relay.qnn.op.conv2d( + relay.cast(data_expr, dtype="int16"), + relay.cast(kernel_expr, dtype="int16"), + *inputs[2:], + **edit_attrs(attrs, kernel_layout=new_kernel_layout, out_layout="NHWC"), ) + # If possible, modify depthwise ops to take as input NCHW instead. + if is_depthwise and prev_ops_match(op.args[0], ("cast", "qnn.requantize", "add", "qnn.conv2d")): + op = _alter_depthwise_conv2d_layout(op) + + return op + @qnn_add_alter_layout.register(["arm_cpu"]) def alter_add_layout(_attrs, inputs, _tinfos, _out_type): @@ -78,11 +128,8 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): Currently only supports qnn.conv2d, but qnn.dense support should be added. Note that this optimization means we must pad tensors with the input zero point, and NOT with zero. 
""" - prev_op, biases_data_op = inputs - if not hasattr(prev_op, "op"): - return None - if prev_op.op.name != "qnn.conv2d": + if not prev_ops_match(inputs[0], ("qnn.conv2d",)): return None # We should not perform this alteration if the target has a uint * int SIMD MAC operation (since @@ -93,9 +140,9 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): return None conv_input_zp = prev_op.args[2].data.numpy().item() - kernel = prev_op.args[1].data.numpy() + kernel = _squash_transformations(prev_op.args[1]) - if _is_qnn_op_depthwise_conv2d(prev_op): + if prev_op.attrs.groups == prev_op.attrs.channels: axes_to_sum = "HW" elif prev_op.attrs.groups == 1: axes_to_sum = "HWI" @@ -134,3 +181,44 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): new_biases_op = bias_constant return relay.add(new_conv_op, new_biases_op) + + +@qnn_requantize_alter_layout.register(["arm_cpu"]) +def alter_requantize_layout(attrs, inputs, _tinfos, _out_type): + """Changes a floating point requantize op to use int64 multiply + shift for microTVM. + + Usually, this is done by QNN legalization. However, microTVM wants to manually choose the + integer rounding constants in order to: + (a) Have int32, not int64 constants + (b) Use a constant rounding shift to skip a memory load. + + Ideally, we would pick these constants in the requantize (or fused) schedule. Unfortunately that + is not currently possible, so we pick them with `alter_layout` as a hack. This will only work if + the requantize schedule "plays along" with this hack. + """ + + # Only microTVM Cortex-M boards with DSP use the relevant schedules + current_target = target.Target.current(allow_none=False) + if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu): + return None + + _, in_scale, _, out_scale, _ = inputs + in_scale_numpy = in_scale.data.numpy().astype("float64") + out_scale_scalar = out_scale.data.numpy().item() + + # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting by 1 again. This is + # useful, because shifting a multiplication product by 32 can be done for "free" with SMMUL + scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32") + + # Requantize ops in Relay do not support int32 scales - if we try to use one, requantize.cc will + # raise an error. As a hacky work-around, we change the scale dtype to float32, without changing + # underlying data. This works, as our compute function knows to interpret the scale as an int32. + + # This is only a work-around - a better long-term solution would be adding a new integer + # requantize op, which takes integer scales, shifts, and rounding behavior. + fake_float_scales = scales.view("float32") + + scale_constant = relay.Constant(nd.array(fake_float_scales)) + new_attrs = {k: attrs[k] for k in attrs.keys()} + new_attrs["out_dtype"] = "int16" + return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], **new_attrs) diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py new file mode 100644 index 000000000000..447b86f464fa --- /dev/null +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -0,0 +1,340 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""QNN legalization transforms that help eliminate sparse channels. + +Some models (like MobileNetV1 when fine-tuned) have output channels in their kernels which are +completely full of zeros. Sometimes these can be optimized away by the C compiler, but this does not +happen when complex schedules (like the ACLE tensordot convolutions) are used. + +Instead, we will remove these channels by replacing blocks of operators with equivalent "denser" +ones during legalization. This is harder than it looks - while the outputs of channels with all-zero +kernels do not depend on the input data, they are usually not zero. We work around this by computing +how these constant values affect subsequent operators, and "folding" these effects into a bias_add. + +It would eventually be nice to have a generalized, cross-target solution for removing zero channels, +as there is no downside. This may be possible with Relax, but I'm unsure. +""" + +from scipy.signal import convolve2d +from tvm.topi.utils import get_const_tuple +from tvm import nd +from .qnn_alter_op import prev_ops_match, edit_attrs +from ..nn import * + + +def _compute_fixed_conv2d_outputs(requantize_op): + """Compute all conv2d output values that do not depend on the layer input.""" + bias_add_op = requantize_op.args[0] + conv2d_op = bias_add_op.args[0] + + assert conv2d_op.attrs.kernel_layout.isalpha() + assert conv2d_op.attrs.groups == 1 + kernel = conv2d_op.args[1].data.numpy() + oc_axis = conv2d_op.attrs.kernel_layout.index("O") + + num_channels = kernel.shape[oc_axis] + rq_input_scale = requantize_op.args[1].data.numpy() + rq_output_scale = requantize_op.args[3].data.numpy().item() + rq_output_zero_point = requantize_op.args[4].data.numpy().item() + bias_data = bias_add_op.args[1].data.numpy() + + fixed_outputs = {} + + for i in range(num_channels): + if np.any(np.take(kernel, i, axis=oc_axis)): + continue + scale = rq_input_scale[i] / rq_output_scale + channel_constant = round(bias_data[i] * scale + rq_output_zero_point) + clipped = min(127, max(-128, channel_constant)) + fixed_outputs[i] = clipped + + return fixed_outputs + + +def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): + """Compute all depthwise conv2d output values that do not depend on the PREVIOUS layer input.""" + bias_add_op = requantize_op.args[0] + depthwise_op = bias_add_op.args[0] + + assert depthwise_op.attrs.kernel_layout.isalpha() + assert depthwise_op.attrs.groups > 1 + kernel = depthwise_op.args[1].data.numpy() + oc_axis = depthwise_op.attrs.kernel_layout.index("O") + + conv_input_zero_point = depthwise_op.args[2].data.numpy().item() + rq_input_scale = requantize_op.args[1].data.numpy() + rq_output_scale = requantize_op.args[3].data.numpy().item() + rq_output_zero_point = requantize_op.args[4].data.numpy().item() + bias_data = bias_add_op.args[1].data.numpy() + + kernel_size = get_const_tuple(depthwise_op.attrs.kernel_size) + pad_top, pad_left, pad_bottom, pad_right = get_const_tuple(depthwise_op.attrs.padding) + + # Make a kernel_size x kernel_size array of fixed_input + # Pad it with zeros usint padding + # Do a convolution and 
make sure + + fixed_outputs = {} + + for i, fixed_input in fixed_channel_inputs.items(): + input_array = np.full(kernel_size, fixed_input, dtype="int32") - conv_input_zero_point + kernel_channel = np.take(kernel, i, axis=oc_axis).reshape(kernel_size) + scale = rq_input_scale[i] / rq_output_scale + + convolved = convolve2d(input_array, kernel_channel, mode="same") + rounded = np.around((convolved + bias_data[i]) * scale).astype("int32") + clipped = np.clip(rounded + rq_output_zero_point, -128, 127) + + # TODO - this is kinda paranoid and I think it removes too many. Make sure this is accurate. + # This bug likely causes a ~5% performance reduction + if np.all(clipped == clipped[0, 0]): + fixed_outputs[i] = clipped[0, 0] + else: + # TODO remove + assert True + + # TODO look for all-zero entries in the depthwise kernel. I don't think these + # really occur in practice, but it would be nice for theoretical completeness. + + return fixed_outputs + + +def _excise_conv2d_channels(empty_channels, input_op, requantize_op, is_depthwise=False): + bias_add_op = requantize_op.args[0] + conv2d_op = bias_add_op.args[0] + axis = conv2d_op.attrs.kernel_layout.index("O") + + kernel_data = np.delete(conv2d_op.args[1].data.numpy(), empty_channels, axis=axis) + bias_data = np.delete(bias_add_op.args[1].data.numpy(), empty_channels) + in_scale_data = np.delete(conv2d_op.args[5].data.numpy(), empty_channels) + out_scale_data = np.delete(requantize_op.args[1].data.numpy(), empty_channels) + num_channels = kernel_data.shape[axis] + if is_depthwise: + num_groups = num_channels + else: + num_groups = 1 + + return relay.qnn.op.requantize( + relay.nn.bias_add( + relay.qnn.op.conv2d( + input_op, + relay.Constant(nd.array(kernel_data)), + *conv2d_op.args[2:5], + relay.Constant(nd.array(in_scale_data)), + **edit_attrs(conv2d_op.attrs, channels=num_channels, groups=num_groups), + ), + relay.Constant(nd.array(bias_data)), + **bias_add_op.attrs, + ), + relay.Constant(nd.array(out_scale_data)), + *requantize_op.args[2:], + **requantize_op.attrs, + ) + + +def _excise_avg_pool_channels(empty_channels, input_op, first_reshape_op, axis=1): + outer_cast = first_reshape_op.args[0].args[0] + avg_pool = outer_cast.args[0] + inner_cast = avg_pool.args[0] + + new_shape = list(get_const_tuple(first_reshape_op.attrs.newshape)) + new_shape[axis] -= len(empty_channels) + + return relay.reshape( + relay.cast( + relay.nn.avg_pool2d(relay.cast(input_op, **inner_cast.attrs), **avg_pool.attrs), + **outer_cast.attrs, + ), + **edit_attrs(first_reshape_op.attrs, newshape=new_shape), + ) + + +def _fold_into_conv_bias(fixed_inputs, conv2d_op, input_op): + assert not any(get_const_tuple(conv2d_op.attrs.padding)) + in_axis = conv2d_op.attrs.kernel_layout.index("I") + out_axis = conv2d_op.attrs.kernel_layout.index("O") + + kernel = conv2d_op.args[1].data.numpy() + zero_point = conv2d_op.args[2].data.numpy().item() + + extra_bias = np.zeros((kernel.shape[out_axis],), dtype="int32") + + # For every output channel + for i in range(kernel.shape[out_axis]): + out_kernel_slice = np.expand_dims(np.take(kernel, i, axis=out_axis), axis=out_axis) + + # For every input channel that is being removed: + for j, val in fixed_inputs.items(): + kernel_slice = np.take(out_kernel_slice, j, axis=in_axis) + accumulator = np.sum(kernel_slice * (val - zero_point)) + extra_bias[i] += accumulator + + stripped_kernel = np.delete(kernel, tuple(fixed_inputs.keys()), axis=in_axis) + new_conv = relay.qnn.op.conv2d( + input_op, + relay.Constant(nd.array(stripped_kernel)), + 
*conv2d_op.args[2:], + **conv2d_op.attrs, + ) + + return new_conv, extra_bias + + +def _fold_into_dense_bias(fixed_inputs, dense_op, input_op, channel_axis=1): + weights = dense_op.args[1].data.numpy() + assert channel_axis < 2 + assert len(weights.shape) == 2 + zero_point = dense_op.args[2].data.numpy().item() + + extra_bias = np.zeros((weights.shape[1 - channel_axis],), dtype="int32") + + # For every output channel + for i in range(weights.shape[1 - channel_axis]): + out_weights_slice = np.take(weights, i, axis=1 - channel_axis) + + # For every input channel that is being removed: + for j, val in fixed_inputs.items(): + weight = out_weights_slice[j] + extra_bias[i] += (val - zero_point) * weight + + stripped_weights = np.delete(weights, tuple(fixed_inputs.keys()), axis=channel_axis) + new_dense = relay.qnn.op.dense( + input_op, + relay.Constant(nd.array(stripped_weights)), + *dense_op.args[2:], + **dense_op.attrs, + ) + + return new_dense, extra_bias + + +def _densify_conv_depthwise_conv_pattern(attrs, inputs): + """Rewrites a regular -> depthwise -> regular convolution pattern to excise empty out channels. + + Should be called as part of legalization (before dtypes and layouts are rewritten) and with the + BIAS ADD OPERATOR'S (the one we'll use to "fold in" our constants) `attrs` and `inputs`. The + last regular conv2d operator must be unpadded. + """ + current_conv = inputs[0] + depthwise_requantize = current_conv.args[0] + top_requantize = depthwise_requantize.args[0].args[0].args[0] + top_conv2d = top_requantize.args[0].args[0] + + fixed_conv2d_outputs = _compute_fixed_conv2d_outputs(top_requantize) + fixed_dw_outputs = _compute_fixed_depthwise_outputs(depthwise_requantize, fixed_conv2d_outputs) + + # Ensure number of channels is divisible by two + if len(fixed_dw_outputs) % 2 > 0: + fixed_dw_outputs.popitem() + + if not fixed_dw_outputs: + return None + + unneeded_channels = tuple(fixed_dw_outputs.keys()) + new_top_conv2d = _excise_conv2d_channels(unneeded_channels, top_conv2d.args[0], top_requantize) + new_dw_conv2d = _excise_conv2d_channels( + unneeded_channels, new_top_conv2d, depthwise_requantize, is_depthwise=True + ) + new_conv, extra_bias = _fold_into_conv_bias(fixed_dw_outputs, current_conv, new_dw_conv2d) + + new_bias = inputs[1].data.numpy() + extra_bias + new_op = relay.nn.bias_add(new_conv, relay.Constant(nd.array(new_bias)), **attrs) + return new_op + + +def _densify_conv_pool_dense_pattern(attrs, inputs): + """Rewrites a regular conv -> pool -> dense pattern to excise empty out channels from the conv. + + Should be called as part of legalization (before dtypes and layouts are rewritten) and with the + BIAS ADD operator's `attrs` and `inputs` (the one we'll use to "fold in" our constants). The + average pool operator must reduce the height and width dimensions to 1x1. 
+ """ + first_reshape = inputs[0].args[0] + top_requantize = first_reshape.args[0].args[0].args[0].args[0].args[0] + top_conv2d = top_requantize.args[0].args[0] + + fixed_conv2d_outputs = _compute_fixed_conv2d_outputs(top_requantize) + + # Ensure number of channels is divisible by two + if len(fixed_conv2d_outputs) % 2 > 0: + fixed_dw_outputs.popitem() + + if not fixed_conv2d_outputs: + return None + + unneeded_channels = tuple(fixed_conv2d_outputs.keys()) + new_top_conv2d = _excise_conv2d_channels(unneeded_channels, top_conv2d.args[0], top_requantize) + new_avg_pool = _excise_avg_pool_channels(unneeded_channels, new_top_conv2d, first_reshape) + new_conv, extra_bias = _fold_into_dense_bias(fixed_conv2d_outputs, inputs[0], new_avg_pool) + + new_bias = inputs[1].data.numpy() + extra_bias + new_op = relay.nn.bias_add(new_conv, relay.Constant(nd.array(new_bias)), **attrs) + return new_op + + +@qnn_bias_add_legalize.register(["arm_cpu"]) +def legalize_bias_add(attrs, inputs, _tinfos): + """Remove empty convolution channels when possible, and "fold" them into the bias add. + + TODO these rewrites are always beneficial and will improve performance cross-platform - should + we enable them for all platforms, not just arm_cpu? + """ + + if prev_ops_match( + inputs[0], + ( + "qnn.conv2d", + "qnn.requantize", + "nn.bias_add", + "qnn.conv2d", + "qnn.requantize", + "nn.bias_add", + "qnn.conv2d", + ), + ): + current_conv = inputs[0] + depthwise_conv2d = current_conv.args[0].args[0].args[0] + top_conv2d = depthwise_conv2d.args[0].args[0].args[0] + if ( + not any(get_const_tuple(current_conv.attrs.padding)) + and current_conv.attrs.groups == 1 + and depthwise_conv2d.attrs.groups > 1 + and top_conv2d.attrs.groups == 1 + ): + return _densify_conv_depthwise_conv_pattern(attrs, inputs) + + if prev_ops_match( + inputs[0], + ( + "qnn.dense", + "reshape", + "reshape", + "cast", + "nn.avg_pool2d", + "cast", + "qnn.requantize", + "nn.bias_add", + "qnn.conv2d", + ), + ): + avg_pool = inputs[0].args[0].args[0].args[0].args[0] + top_requantize = avg_pool.args[0].args[0] + top_conv2d = top_requantize.args[0].args[0] + if top_conv2d.attrs.groups == 1: + return _densify_conv_pool_dense_pattern(attrs, inputs) + + return None diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py index 9aaa452a7392..1f624f3b3466 100644 --- a/python/tvm/topi/nn/qnn.py +++ b/python/tvm/topi/nn/qnn.py @@ -191,8 +191,8 @@ def _dispatch_sim_dequantize(value): @tvm.target.generic_func -def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type): - """Change requantize layout. +def qnn_conv2d_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change qnn.conv2d layout. Parameters ---------- @@ -212,6 +212,26 @@ def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type): return None +@tvm.target.generic_func +def qnn_bias_add_legalize(_attrs, _inputs, _tinfos): + """Legalize bias_add layout. + + Bias add is not a QNN-specific function, but this generic exists so that empty channels can + be excised from quantized conv2d operators and folded into bias adds. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + + """ + return None + + @tvm.target.generic_func def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): """Change add layout. 
@@ -239,9 +259,8 @@ def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): @tvm.target.generic_func -def qnn_conv2d_alter_layout(_attrs, _inputs, _tinfos, _out_type): - """Change qnn.conv2D layout. - Not to change by default +def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change requantize layout. Parameters ---------- @@ -253,6 +272,10 @@ def qnn_conv2d_alter_layout(_attrs, _inputs, _tinfos, _out_type): Input shape and dtype out_type: type The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. """ return None diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index 4ff8a59b349e..8421721b2777 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -277,6 +277,7 @@ Array GetPassPrefix(bool is_homogeneous, bool is_vm) { } // Fast math optimizations. + pass_seqs.push_back(transform::SimplifyExpr()); pass_seqs.push_back(transform::FastMath()); pass_seqs.push_back(transform::FoldConstant()); @@ -416,7 +417,7 @@ Optional DefaultTIRConverterImpl(const Array& args, return NullOpt; } } - PrimFunc func = te::CreatePrimFuncWithConstants(args, constants, DataType::Int(64)); + PrimFunc func = te::CreatePrimFuncWithConstants(args, constants, DataType::Int(32)); bool dynamic_loop_extent = false; tir::PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void { if (const auto* loop = obj.as()) { diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 6bf14424bf38..523bb51edf71 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -26,6 +26,7 @@ #include #include +#include #include "../../arith/pattern_match.h" #include "codegen_params.h" @@ -632,7 +633,8 @@ void CodeGenC::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, } void CodeGenC::VisitStmt_(const AllocateConstNode* op) { - std::string symbol_name = op->buffer_var->name_hint; + int suffix = rand() % (2 << 24); + std::string symbol_name = op->buffer_var->name_hint + "_" + std::to_string(suffix); int64_t num_elements = 1; const auto& data = op->data.value(); diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index 6e7fc38fa349..f1065d88937e 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -95,6 +95,63 @@ def _get_mobilenet_v1_layer_attributes(layer_num): return ((1, 1, 1, 1), (1, 1), True) +@pytest.mark.parametrize("layer", range(2, 27, 2)) +def test_infinite_bias_detection(interpreter, layer): + """Some models (mainly MobileNetV1) have kernels with many output channels full entirely of + zeroes. The VWW mdoel is one of these. This test confirms that the outputs of these channels, + as computed by TensorFlow, are indeed not dependent upon the input values. 
+ """ + + _, kernel, bias, output = _load_tflite_layer(interpreter, layer) + kernel_data, kernel_quant = kernel + bias_data, bias_quant = bias + output_data, output_quant = output + is_depthwise = _get_mobilenet_v1_layer_attributes(layer)[2] + assert not is_depthwise + assert kernel_data.shape[1] == kernel_data.shape[2] == 1 + + out_channels = kernel_data.shape[3] + fixed_channels = {} + + out_zero_point = output_quant["zero_points"][0] + assert out_zero_point == -128 + + for i in range(out_channels): + # Skip over output channels with data + if np.any(kernel_data[i, 0, 0, :]): + continue + + scale = bias_quant["scales"][i] / output_quant["scales"][0] + channel_constant = round(bias_data[i] * scale + out_zero_point) + clipped = min(127, max(-128, channel_constant)) + + out_channel_values = output_data[0, :, :, i].flatten() + assert all(x == clipped for x in out_channel_values) + fixed_channels[i] = clipped + print(f"Layer {layer} had {len(fixed_channels)}/{out_channels} empty!") + + # We now need to compute values for the following depthwise layer + if layer == 26: + return + + _, kernel, bias, output = _load_tflite_layer(interpreter, layer + 1) + kernel_data, kernel_quant = kernel + bias_data, bias_quant = bias + output_data, output_quant = output + is_depthwise = _get_mobilenet_v1_layer_attributes(layer + 1)[2] + assert is_depthwise + + for i, value in fixed_channels.items(): + print(i) + assert np.all(output_data[:, :, :, i] == output_data[0, 0, 0, i]) + print(output_data[0, 0, 0, i]) + + for i in range(out_channels): + print(kernel_data.shape) + # Skip over output channels with data + assert np.any(kernel_data[:, :, :, i]) + + def _get_relu_activation_prefix(layer_num): if layer_num == 0: return "model/activation/Relu;" @@ -206,7 +263,7 @@ def _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams, is_depthwise= kernel_layout="IOHW" if is_depthwise else kernel_layout, dilation=(1, 1), strides=strides, - padding=padding, + padding=(0, 0, 0, 0), groups=groups, channels=channels, out_dtype="int32", From 5c67f472e24005f47b50f923c7a337dfbfd50daf Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 11 Jan 2023 14:46:44 -0800 Subject: [PATCH 04/19] Add support for layer 25 Sketchy support for layer 25 Add nice layer 25! --- python/tvm/relay/qnn/strategy/arm_cpu.py | 24 +- python/tvm/topi/arm_cpu/qnn.py | 241 +++++++++++++----- .../arm_cpu/test_quantized_convolution.py | 19 +- 3 files changed, 203 insertions(+), 81 deletions(-) diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py index f9e7c7abac68..187f1c46de15 100644 --- a/python/tvm/relay/qnn/strategy/arm_cpu.py +++ b/python/tvm/relay/qnn/strategy/arm_cpu.py @@ -24,6 +24,7 @@ from .generic import * from ... 
import op as _op from ...op.strategy.generic import is_depthwise_conv2d +from tvm.topi.utils import get_const_tuple @qnn_quantize_strategy.register("arm_cpu") @@ -89,7 +90,7 @@ def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): strategy = _op.OpStrategy() if groups == 1: - if True or (data_layout == "NHWC" and kernel_layout == "OHWI"): + if data_layout == "NHWC" and kernel_layout == "OHWI": strategy.add_implementation( topi.arm_cpu.qnn_conv2d, topi.arm_cpu.schedule_qnn_conv2d, @@ -98,12 +99,21 @@ def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): else: raise TVMError("QNN regular Conv2D for Arm Cortex-M DSP got incorrect input layout!") elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups): - if True or (data_layout == "NCHW" and kernel_layout == "IOHW"): - strategy.add_implementation( - topi.arm_cpu.qnn_depthwise_conv2d, - topi.arm_cpu.schedule_qnn_depthwise_conv2d, - name="qnn_depthwise_conv2d.arm_cpu", - ) + if data_layout == "NCHW" and kernel_layout == "IOHW": + height, width = data.shape[2:] + y_stride, x_stride = get_const_tuple(attrs.strides) + if height * width * y_stride % 2 == 0: + strategy.add_implementation( + topi.arm_cpu.qnn_depthwise_conv2d, + topi.arm_cpu.schedule_qnn_depthwise_conv2d, + name="qnn_depthwise_conv2d.arm_cpu", + ) + else: + strategy.add_implementation( + topi.arm_cpu.qnn_explicit_depthwise_conv2d, + topi.arm_cpu.schedule_qnn_depthwise_conv2d, + name="qnn_depthwise_conv2d.arm_cpu", + ) else: raise TVMError("QNN depthwise Conv2D for Arm Cortex-M DSP got incorrect input layout!") else: diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index 7fa80b99a939..84a907ca3d7a 100644 --- a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -20,15 +20,16 @@ Arm Cortex-M with DSP. 
""" -from typing import Callable, Tuple +from typing import Callable, Dict, Tuple import tvm from tvm import te, TVMError -from tvm.tir import const from tvm.script import tir as T +from tvm.tir import const +from tvm.topi.nn.pad import pad + from ..utils import get_const_tuple from .mprofile.dsp.micro_kernel import tensordot -from tvm.topi.nn.pad import pad def int_ceil_division(x, y): @@ -121,6 +122,25 @@ def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False): return (aligned_func, offset_func) +def _optionally_pad_data(data, layout, padding, pad_value=-128): + # rq_input_zero_point_const = inputs[8].op.body[0] + + padding = get_const_tuple(padding) + if any(padding): + pad_before = [0, 0, 0, 0] + pad_after = [0, 0, 0, 0] + + pad_up, pad_left, pad_down, pad_right = padding + pad_before[layout.index("H")] = pad_up + pad_after[layout.index("H")] = pad_down + pad_before[layout.index("W")] = pad_left + pad_after[layout.index("W")] = pad_right + return pad(data, pad_before, pad_after, pad_value) + + else: + return data + + def _make_tscript_ptr(buffer, offset, length, dtype="int16"): return T.tvm_access_ptr( T.type_annotation(dtype=dtype), @@ -132,6 +152,14 @@ def _make_tscript_ptr(buffer, offset, length, dtype="int16"): ) +def _bias_ptr(bias, c): + return _make_tscript_ptr(bias, c, 1, dtype="int32") + + +def _scale_ptr(scale, c): + return _make_tscript_ptr(scale, c, 1, dtype="int32") + + def _make_tscript_call(func_name, *args): return T.evaluate(T.call_extern(func_name, *args, dtype="int32")) @@ -176,12 +204,6 @@ def output_ptr(output, y, x, c): else: raise TVMError(f"Unsupported out_layout '{output_layout}'!") - def bias_ptr(bias, c): - return _make_tscript_ptr(bias, c, 1, dtype="int32") - - def scale_ptr(scale, c): - return _make_tscript_ptr(scale, c, 1, dtype="int32") - @T.prim_func def biased_quantized_conv2d( data_handle: T.handle, @@ -217,14 +239,15 @@ def biased_quantized_conv2d( ): with T.block("conv2d_aligned"): T.block_attr({"pragma_import_c": aligned_func_code}) - y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + y, x, c_interval = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + c = c_interval * c_step _make_tscript_call( aligned_func_name, - output_ptr(output, y, x, c * c_step), - data_ptr(data, y, x, c * c_step), - kernel_ptr(kernel, c * c_step), - bias_ptr(bias, c * c_step), - scale_ptr(scale, c * c_step), + output_ptr(output, y, x, c), + data_ptr(data, y, x, c), + kernel_ptr(kernel, c), + _bias_ptr(bias, c), + _scale_ptr(scale, c), ) for c_ax, y_ax, x_ax in T.grid( @@ -232,14 +255,15 @@ def biased_quantized_conv2d( ): with T.block("conv2d_offset"): T.block_attr({"pragma_import_c": offset_func_code}) - y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + y, x, c_interval = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + c = c_interval * c_step + 1 _make_tscript_call( offset_func_name, - output_ptr(output, y, x, c * c_step + 1), - data_ptr(data, y, x, c * c_step + 1, offset=1), - kernel_ptr(kernel, c * c_step + 1, offset=1), - bias_ptr(bias, c * c_step + 1), - scale_ptr(scale, c * c_step + 1), + output_ptr(output, y, x, c), + data_ptr(data, y, x, c, offset=1), + kernel_ptr(kernel, c, offset=1), + _bias_ptr(bias, c), + _scale_ptr(scale, c), ) return biased_quantized_conv2d @@ -257,29 +281,13 @@ def qnn_conv2d(attrs, inputs, out_type): # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper. 
assert len(inputs) == 11 assert attrs - import pdb data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - + padded_data = _optionally_pad_data(data, attrs.data_layout, attrs.padding) + _, height, width, in_channels = get_const_tuple(padded_data.shape) out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) - y_stride, x_stride = get_const_tuple(attrs.strides) - # rq_input_zero_point_const = inputs[8].op.body[0] - rq_input_zero_point_const = -128 - - padding = get_const_tuple(attrs.padding) - if any(padding): - pad_up, pad_left, pad_down, pad_right = padding - padded_data = pad( - data, - [0, pad_up, pad_left, 0], - [0, pad_down, pad_right, 0], - rq_input_zero_point_const, - ) - else: - padded_data = data - - _, height, width, in_channels = get_const_tuple(padded_data.shape) + y_stride, x_stride = get_const_tuple(attrs.strides) out_height = _compute_output_dim(height, kernel_h, y_stride) out_width = _compute_output_dim(width, kernel_w, x_stride) @@ -351,25 +359,11 @@ def qnn_depthwise_conv2d(attrs, inputs, out_type): assert len(inputs) == 11 data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - + padded_data = _optionally_pad_data(data, attrs.data_layout, attrs.padding) + _, _, height, width = get_const_tuple(padded_data.shape) _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) - y_stride, x_stride = get_const_tuple(attrs.strides) - # rq_input_zero_point_const = inputs[8].op.body[0] - rq_input_zero_point_const = -128 - padding = get_const_tuple(attrs.padding) - if any(padding): - pad_up, pad_left, pad_down, pad_right = padding - padded_data = pad( - data, - [0, 0, pad_up, pad_left], - [0, 0, pad_down, pad_right], - rq_input_zero_point_const, - ) - else: - padded_data = data - - _, _, height, width = get_const_tuple(padded_data.shape) + y_stride, x_stride = get_const_tuple(attrs.strides) out_height = _compute_output_dim(height, kernel_h, y_stride) out_width = _compute_output_dim(width, kernel_w, x_stride) @@ -415,3 +409,136 @@ def kernel_ptr(buffer, c, offset=0): def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target): """Schedule function for qnn.depthwise_conv2d.""" return None + + +def _make_unrolled_conv2d_primfunc( + output_dimensions: Tuple[int, int, int], + buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple], + function_names: Dict[Tuple, str], + function_code: str, + ptr_gens: Tuple[Callable, Callable], + output_layout="NHWC", +): + out_height, out_width, out_channels = output_dimensions + data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes + data_ptr, kernel_ptr = ptr_gens + + def output_ptr(output, y, c): + if output_layout == "NHWC": + return _make_tscript_ptr(output, y * const(out_width * out_channels) + c, 1) + elif output_layout == "NCHW": + return _make_tscript_ptr( + output, c * const(out_height * out_width) + y * const(out_width), 1 + ) + else: + raise TVMError(f"Unsupported out_layout '{output_layout}'!") + + def _make_row_call(buffers, c_var, y, c): + output, data, kernel, bias, scale = buffers + return _make_tscript_call( + function_names[(y + c) % 2, c % 2, 0], + output_ptr(output, y, c_var + c), + data_ptr(data, y, c_var + c, offset=(y + c) % 2), + kernel_ptr(kernel, c_var + c, offset=c), + _bias_ptr(bias, c_var + c), + _scale_ptr(scale, c_var + c), + ) + + @T.prim_func + def biased_quantized_conv2d( + data_handle: T.handle, + kernel_handle: T.handle, + bias_handle: T.handle, + scale_handle: T.handle, + output_handle: T.handle, + ) -> None: + # Same setup is used as in 
_make_conv2d_primfunc + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + data = T.match_buffer(data_handle, data_shape, dtype="int16") + kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16") + bias = T.match_buffer(bias_handle, bias_shape, dtype="int32") + scale = T.match_buffer(scale_handle, scale_shape) + output = T.match_buffer(output_handle, output_shape, dtype="int16") + + # pylint: disable=unused-variable + output[0, 0, 0, 0] = 0 + __1 = data[0, 0, 0, 0] + __2 = kernel[0, 0, 0, 0] + __3 = bias[0, 0, 0, 0] + __4 = scale[0] + # pylint: enable=unused-variable + + for c_ax in T.grid(out_channels // 2): + with T.block("conv2ds"): + T.block_attr({"pragma_import_c": function_code}) + c = T.axis.remap("S", [c_ax]) * 2 + _make_row_call((output, data, kernel, bias, scale), c, 0, 0) + _make_row_call((output, data, kernel, bias, scale), c, 1, 0) + _make_row_call((output, data, kernel, bias, scale), c, 2, 0) + _make_row_call((output, data, kernel, bias, scale), c, 0, 1) + _make_row_call((output, data, kernel, bias, scale), c, 1, 1) + _make_row_call((output, data, kernel, bias, scale), c, 2, 1) + + return biased_quantized_conv2d + + +def qnn_unrolled_depthwise_conv2d(attrs, inputs, out_type): + """Compute for qnn.depthwise_conv2d with NCHW layout for convolutions with small width, height. + + Behaves similarly to qnn_depthwise_conv2d, but does not iterate over the output width and height + and instead calls these functions explicitly. This gives a tiny performance boost in exchange + for larger code size, but more importantly does not require out_width * out_height + * y_stride % 2 == 0. This does, however, require y_stride == x_stride == 1. + """ + + assert len(inputs) == 11 + y_stride, x_stride = get_const_tuple(attrs.strides) + assert y_stride == x_stride == 1 + + data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] + padded_data = _optionally_pad_data(data, attrs.data_layout, attrs.padding) + _, _, height, width = get_const_tuple(padded_data.shape) + _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + + y_stride, x_stride = get_const_tuple(attrs.strides) + out_height = _compute_output_dim(height, kernel_h, y_stride) + out_width = _compute_output_dim(width, kernel_w, x_stride) + + rq_output_zero_point_const = inputs[10] + assert len(rq_output_zero_point_const.op.body) == 1 + output_zero_point = rq_output_zero_point_const.op.body[0] + + dimensions = (width, kernel_h, kernel_w) + x_strides = (1, out_channels) + + # data, kernel + func_names = {} + impls = [] + for alignment in ((0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0)): + func_name, impl = tensordot.tensordot_int16_impl( + out_width, dimensions, alignment, x_strides, output_zero_point=output_zero_point + ) + func_names[alignment] = func_name + impls.append(impl) + + def data_ptr(buffer, y, c, offset=0): + return _make_tscript_ptr(buffer, c * const(width * height) + y * const(width) - offset, 1) + + def kernel_ptr(buffer, c, offset=0): + return _make_tscript_ptr(buffer, c * const(kernel_h * kernel_w) - offset, 1) + + prim_func = _make_unrolled_conv2d_primfunc( + (out_height, out_width, out_channels), + (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), + func_names, + "\n".join(impls), + (data_ptr, kernel_ptr), + output_layout=attrs.out_layout, + ) + output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") + return [output] + + +def schedule_qnn_unrolled_depthwise_conv2d(_attrs, _outs, _target): + """Schedule function for 
qnn.depthwise_conv2d.""" + return None diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index f1065d88937e..640b790073ea 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -142,14 +142,7 @@ def test_infinite_bias_detection(interpreter, layer): assert is_depthwise for i, value in fixed_channels.items(): - print(i) assert np.all(output_data[:, :, :, i] == output_data[0, 0, 0, i]) - print(output_data[0, 0, 0, i]) - - for i in range(out_channels): - print(kernel_data.shape) - # Skip over output channels with data - assert np.any(kernel_data[:, :, :, i]) def _get_relu_activation_prefix(layer_num): @@ -341,16 +334,8 @@ def _make_executor(): ) -def _make_parameterization(layer_num): - if layer_num == 25: - mark = pytest.mark.xfail(reason="unaligned input data ptr", run=False) - return pytest.param(layer_num, marks=mark) - else: - return layer_num - - @pytest.mark.parametrize("output_layout", ["NHWC", "NCHW"]) -@pytest.mark.parametrize("layer", (_make_parameterization(n) for n in range(27))) +@pytest.mark.parametrize("layer", range(27)) @tvm.testing.requires_corstone300 def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer, output_layout): """Checks microTVM output against TFLite for one MobileNetV1 layer. @@ -360,7 +345,7 @@ def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer, output_layout): same structure. The Function is run using microTVM and AOTTestModel, and we verify microTVM's output is the same as the TFLite ground truth. - This function only cross-checks the first 23 layers in MobileNetV1, which are regular and + This function only cross-checks the first 27 layers in MobileNetV1, which are regular and depthwise 2D convolutions (this function only works for 2D convolutions). We do not test the average pool, dense, or softmax layers at the end of the model. 
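
The dispatch rule this patch adds for depthwise convolutions is easy to state in isolation. A minimal sketch, using an invented helper name and illustrative shapes (it is not part of the patch itself), of how the strategy ends up choosing between the existing tensordot schedule and the new unrolled one once the follow-up fixes land:

    # Sketch only -- mirrors the strategy condition added in this patch. The existing schedule
    # needs height * width * y_stride to be even, presumably so the int16 data pointer handed
    # to each tensordot call keeps the same word alignment; the unrolled schedule drops that
    # requirement but assumes unit strides.
    def pick_depthwise_schedule(height: int, width: int, y_stride: int, x_stride: int) -> str:
        if height * width * y_stride % 2 == 0:
            return "sliding"  # the existing qnn_depthwise_conv2d schedule
        if y_stride == x_stride == 1:
            return "unrolled"  # the new schedule introduced for layer-25-style shapes
        raise ValueError("no Cortex-M depthwise schedule supports these parameters")

    assert pick_depthwise_schedule(48, 48, 1, 1) == "sliding"
    assert pick_depthwise_schedule(7, 7, 1, 1) == "unrolled"
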
From 434f4c5aee92f8dabf630ff68b4cf6b71bba4a54 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 11 Jan 2023 16:57:42 -0800 Subject: [PATCH 05/19] Use new QNN strategies when applicable Update strategy to use new schedule when applicable Functional --- python/tvm/relay/qnn/strategy/arm_cpu.py | 10 ++++---- python/tvm/testing/aot.py | 7 +++++- python/tvm/topi/arm_cpu/qnn_alter_op.py | 24 ++++++++++++++++--- .../arm_cpu/test_quantized_convolution.py | 4 ++-- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py index 187f1c46de15..def37f35f7cb 100644 --- a/python/tvm/relay/qnn/strategy/arm_cpu.py +++ b/python/tvm/relay/qnn/strategy/arm_cpu.py @@ -108,12 +108,14 @@ def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): topi.arm_cpu.schedule_qnn_depthwise_conv2d, name="qnn_depthwise_conv2d.arm_cpu", ) - else: + elif y_stride == x_stride == 1: strategy.add_implementation( - topi.arm_cpu.qnn_explicit_depthwise_conv2d, - topi.arm_cpu.schedule_qnn_depthwise_conv2d, - name="qnn_depthwise_conv2d.arm_cpu", + topi.arm_cpu.qnn_unrolled_depthwise_conv2d, + topi.arm_cpu.schedule_qnn_unrolled_depthwise_conv2d, + name="qnn_unrolled_depthwise_conv2d.arm_cpu", ) + else: + raise TVMError("No QNN depthwise Conv2D Cortex-M schedule supports these params!") else: raise TVMError("QNN depthwise Conv2D for Arm Cortex-M DSP got incorrect input layout!") else: diff --git a/python/tvm/testing/aot.py b/python/tvm/testing/aot.py index 086b2a1bfd90..71686628087d 100644 --- a/python/tvm/testing/aot.py +++ b/python/tvm/testing/aot.py @@ -25,6 +25,7 @@ import logging from typing import Any, NamedTuple, Union, Tuple, Optional, List, Dict, Callable import numpy as np +import time import tvm from tvm import relay @@ -837,6 +838,8 @@ def run_and_check_body(base_path): + f" FVP_DIR={fvp_dir}" + custom_params ) + print(codegen_path) + time.sleep(3) compile_log_path = os.path.join(build_path, "test_compile.log") compile_command = f"{make_command} aot_test_runner" @@ -860,7 +863,9 @@ def run_and_check_body(base_path): _subprocess_check_log_output(run_command, build_path, run_log_path) with open(run_log_path) as run_log: - assert AOT_SUCCESS_TOKEN in run_log.read() + log = run_log.read() + print(log) + assert AOT_SUCCESS_TOKEN in log return True diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index fc419153173a..a6983440affc 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -20,6 +20,7 @@ from tvm import nd, relay, target from ..nn import * +from ..utils import get_const_tuple def prev_ops_match(curr_op, pattern): @@ -106,17 +107,34 @@ def alter_conv2d_layout(attrs, inputs, _tinfos, _out_type): data_expr, kernel_expr = inputs[:2] is_depthwise = attrs.groups > 1 new_kernel_layout = "IOHW" if is_depthwise else "OHWI" + assert attrs.data_layout == "NHWC" + padding = get_const_tuple(attrs.padding) + + # For cortex-m devices, it does not make sense to do bounds checking for conv2d at + # runtime, and it is more efficient to copy the tensor to a new buffer. If we did + # this inside our conv2d, it + # We should handle padding separately from convolution, so the original tensor can be + # de-allocated immediately. This may also help with fusing padding onto a previous + # operator. 
+ + if any(attrs.padding): + data_expr = relay.nn.pad(data_expr, ((0, 0), (0, 1), (0, 1), (0, 0)), -128) op = relay.qnn.op.conv2d( relay.cast(data_expr, dtype="int16"), relay.cast(kernel_expr, dtype="int16"), *inputs[2:], - **edit_attrs(attrs, kernel_layout=new_kernel_layout, out_layout="NHWC"), + **edit_attrs( + attrs, + padding=(0, 0, 0, 0), + kernel_layout=new_kernel_layout, + out_layout="NHWC" + ), ) # If possible, modify depthwise ops to take as input NCHW instead. - if is_depthwise and prev_ops_match(op.args[0], ("cast", "qnn.requantize", "add", "qnn.conv2d")): - op = _alter_depthwise_conv2d_layout(op) + #if is_depthwise and prev_ops_match(op.args[0], ("cast", "qnn.requantize", "add", "qnn.conv2d")): + # op = _alter_depthwise_conv2d_layout(op) return op diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index 640b790073ea..fcd4f9db171d 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -256,7 +256,7 @@ def _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams, is_depthwise= kernel_layout="IOHW" if is_depthwise else kernel_layout, dilation=(1, 1), strides=strides, - padding=(0, 0, 0, 0), + padding=padding, groups=groups, channels=channels, out_dtype="int32", @@ -295,7 +295,7 @@ def _make_aot_model(params, hyperparams, layouts, is_depthwise=False): dtype, padding, _strides = hyperparams data_layout, _, output_layout = layouts - if any(padding): + if False: #any(padding): pad_const = int(data_quant["zero_points"][0]) pad_before = (0, padding[0], padding[1], 0) pad_after = (0, padding[2], padding[3], 0) From ca7a0e0b8fd5750ebe5ce9733543c1c45711fd91 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Mon, 23 Jan 2023 19:19:27 -0800 Subject: [PATCH 06/19] Break padding into separate operator Break padding into separate operator Final bugfixes for working VWW model Replace TFLite clipping hack with legalize step Formatting and linting Fix C random gen Reorder C++ imports --- python/tvm/relay/frontend/tflite.py | 3 +- python/tvm/relay/op/nn/_nn.py | 39 ---------- python/tvm/relay/qnn/op/_qnn.py | 72 ++++++++++++++++++- python/tvm/relay/qnn/strategy/arm_cpu.py | 19 +++-- python/tvm/testing/aot.py | 7 +- python/tvm/topi/arm_cpu/qnn.py | 36 +++------- python/tvm/topi/arm_cpu/qnn_alter_op.py | 49 ++++++------- python/tvm/topi/arm_cpu/qnn_legalize.py | 6 +- python/tvm/topi/nn/qnn.py | 22 ++++++ src/target/source/codegen_c.cc | 6 +- .../arm_cpu/test_quantized_convolution.py | 8 +-- 11 files changed, 148 insertions(+), 119 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 19f6887e17fd..db21fa6668d1 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -558,8 +558,7 @@ def convert_qnn_fused_activation_function( if fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1: return _op.clip(expr, a_min=max(qmin, quantize(-1.0)), a_max=min(qmax, quantize(1.0))) if fused_activation_fn == ActivationFunctionType.RELU: - return expr - #return _op.clip(expr, a_min=max(qmin, quantize(0.0)), a_max=qmax) + return _op.clip(expr, a_min=max(qmin, quantize(0.0)), a_max=qmax) fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] raise tvm.error.OpNotImplemented( diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 404ad826cc6d..53aec11e5816 100644 --- 
a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -877,45 +877,6 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts): return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs) -# QNN ops -@reg.register_legalize("nn.bias_add") -def legalize_bias_add(attrs, inputs, tinfos): - """Legalize a bias add operator. - - May be used to "fold in" unused channels from quantized convolution operators. This should - be done before layout rewrites occur to minimize the amount of "extra" overhead operators - like "cast" and "layout_transform". - """ - return topi.nn.qnn.qnn_bias_add_legalize(attrs, inputs, tinfos) - - -@reg.register_alter_op_layout("qnn.conv2d") -def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type): - """Alter the layout of a qnn conv2d op. - - May be used to alter the current QNN Conv2D op, but can also be used to alter previous ops to - better match the current op. For example, Arm Cortex-M uses this to set the out_layout of - previous ops to the input layout preferred by future layouts. - """ - return topi.nn.qnn.qnn_conv2d_alter_layout(attrs, inputs, tinfos, out_type) - - -@reg.register_alter_op_layout("add") -def alter_op_layout_add(attrs, inputs, tinfos, out_type): - """Alter the layout of a add op. - - Useful for fusing the bias constant with an input zero point constant in a previous quantized - op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn. - """ - return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type) - - -@reg.register_alter_op_layout("qnn.requantize") -def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type): - """Alter the layout of a requantization op.""" - return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type) - - # bitpack @reg.register_compute("nn.bitpack") def compute_bitpack(attrs, inputs, out_dtype): diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index e2157a051abb..60dc6fdc6eae 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -17,12 +17,18 @@ # pylint: disable=invalid-name, unused-argument, len-as-condition """QNN operator feature registration""" -from tvm import topi +from tvm import topi, TVMError from .. import strategy from ...op.op import register_compute from ...op.op import register_injective_schedule -from ...op.op import register_strategy, register_pattern, register_alter_op_layout, OpPattern +from ...op.op import ( + OpPattern, + register_alter_op_layout, + register_legalize, + register_pattern, + register_strategy, +) @register_compute("qnn.simulated_quantize") @@ -85,12 +91,72 @@ def simulated_dequantize_compute(attrs, inputs, output_type): register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy) +def _get_clip_dtype_bounds(dtype): + """Returns the minimum and maximum values of a C integer data type.""" + assert "int" in dtype + bits = int(dtype[dtype.find("int") + 3 :]) + + if dtype.startswith("int"): + return (-(2 ** (bits - 1)), 2 ** (bits - 1) - 1) + elif dtype.startswith("uint"): + return (0, 2**bits - 1) + else: + raise TVMError(f"Clip legalization is not supported for data type '{dtype}'!") + + +@register_legalize("clip") +def legalize_clip(attrs, inputs, tinfos): + """Removes clip operators with bounds matching the defaults for their dtype. + + This is already done after alter_op by TVM's simplification passes, but certain QNN operator + implementations (like Cortex-M) need it to be done earlier in legalization. 
+ """ + + if hasattr(inputs[0], "op") and inputs[0].op.name == "qnn.requantize": + if _get_clip_dtype_bounds(tinfos[0].dtype) == (attrs.a_min, attrs.a_max): + return inputs[0] + + return None + + +@register_legalize("nn.bias_add") +def legalize_bias_add(attrs, inputs, tinfos): + """Legalize a bias add operator. + + May be used to "fold in" unused channels from quantized convolution operators. This should + be done before layout rewrites occur to minimize the amount of "extra" overhead operators + like "cast" and "layout_transform". + """ + return topi.nn.qnn_bias_add_legalize(attrs, inputs, tinfos) + + @register_alter_op_layout("qnn.conv2d") def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type): - """Alternate the layout of qnn.conv2d""" + """Alter the layout of a qnn conv2d op. + + May be used to alter the current QNN Conv2D op, but can also be used to alter previous ops to + better match the current op. For example, Arm Cortex-M uses this to set the out_layout of + previous ops to the input layout preferred by future layouts. + """ return topi.nn.qnn_conv2d_alter_layout(attrs, inputs, tinfos, out_type) +@register_alter_op_layout("add") +def alter_op_layout_add(attrs, inputs, tinfos, out_type): + """Alter the layout of a add op. + + Useful for fusing the bias constant with an input zero point constant in a previous quantized + op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn. + """ + return topi.nn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type) + + +@register_alter_op_layout("qnn.requantize") +def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type): + """Alter the layout of a requantization op.""" + return topi.nn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type) + + # qnn.dense register_strategy("qnn.dense", strategy.qnn_dense_strategy) diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py index def37f35f7cb..bddfd7de3a56 100644 --- a/python/tvm/relay/qnn/strategy/arm_cpu.py +++ b/python/tvm/relay/qnn/strategy/arm_cpu.py @@ -21,14 +21,23 @@ regular/depthwise conv2d is supported, but qnn_dense will be added eventually.""" from tvm import topi, TVMError -from .generic import * +from tvm.topi.utils import get_const_tuple from ... 
import op as _op from ...op.strategy.generic import is_depthwise_conv2d -from tvm.topi.utils import get_const_tuple +from .generic import ( + qnn_conv2d_strategy, + qnn_dense_strategy, + qnn_dequantize_strategy, + qnn_quantize_strategy, + wrap_compute_dequantize, + wrap_compute_quantize, + wrap_topi_qnn_dense, + wrap_topi_schedule, +) @qnn_quantize_strategy.register("arm_cpu") -def qnn_quantize_strategy_arm_cpu(attrs, inputs, out_type, target): +def qnn_quantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target): """qnn.quantize strategy for arm_cpu""" strategy = _op.OpStrategy() strategy.add_implementation( @@ -40,7 +49,7 @@ def qnn_quantize_strategy_arm_cpu(attrs, inputs, out_type, target): @qnn_dequantize_strategy.register("arm_cpu") -def qnn_dequantize_strategy_arm_cpu(attrs, inputs, out_type, target): +def qnn_dequantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target): """qnn.dequantize strategy for arm_cpu""" strategy = _op.OpStrategy() strategy.add_implementation( @@ -52,7 +61,7 @@ def qnn_dequantize_strategy_arm_cpu(attrs, inputs, out_type, target): @qnn_dense_strategy.register("arm_cpu") -def qnn_dense_strategy_arm_cpu(attrs, inputs, out_type, target): +def qnn_dense_strategy_arm_cpu(_attrs, _inputs, _out_type, _target): """qnn.dense strategy for arm_cpu""" strategy = _op.OpStrategy() strategy.add_implementation( diff --git a/python/tvm/testing/aot.py b/python/tvm/testing/aot.py index 71686628087d..086b2a1bfd90 100644 --- a/python/tvm/testing/aot.py +++ b/python/tvm/testing/aot.py @@ -25,7 +25,6 @@ import logging from typing import Any, NamedTuple, Union, Tuple, Optional, List, Dict, Callable import numpy as np -import time import tvm from tvm import relay @@ -838,8 +837,6 @@ def run_and_check_body(base_path): + f" FVP_DIR={fvp_dir}" + custom_params ) - print(codegen_path) - time.sleep(3) compile_log_path = os.path.join(build_path, "test_compile.log") compile_command = f"{make_command} aot_test_runner" @@ -863,9 +860,7 @@ def run_and_check_body(base_path): _subprocess_check_log_output(run_command, build_path, run_log_path) with open(run_log_path) as run_log: - log = run_log.read() - print(log) - assert AOT_SUCCESS_TOKEN in log + assert AOT_SUCCESS_TOKEN in run_log.read() return True diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index 84a907ca3d7a..c967fbecce7c 100644 --- a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -17,7 +17,8 @@ """Contains TVMScript implementations of some QNN operators for Arm. Currently, the only ops with compute functions are fused regular and depthwise convolutions for -Arm Cortex-M with DSP. +Arm Cortex-M with DSP. Additionally, these functions explicitly do not support padding - it +must be done in a separate Relay op for memory reasons. 
""" from typing import Callable, Dict, Tuple @@ -26,7 +27,6 @@ from tvm import te, TVMError from tvm.script import tir as T from tvm.tir import const -from tvm.topi.nn.pad import pad from ..utils import get_const_tuple from .mprofile.dsp.micro_kernel import tensordot @@ -122,25 +122,6 @@ def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False): return (aligned_func, offset_func) -def _optionally_pad_data(data, layout, padding, pad_value=-128): - # rq_input_zero_point_const = inputs[8].op.body[0] - - padding = get_const_tuple(padding) - if any(padding): - pad_before = [0, 0, 0, 0] - pad_after = [0, 0, 0, 0] - - pad_up, pad_left, pad_down, pad_right = padding - pad_before[layout.index("H")] = pad_up - pad_after[layout.index("H")] = pad_down - pad_before[layout.index("W")] = pad_left - pad_after[layout.index("W")] = pad_right - return pad(data, pad_before, pad_after, pad_value) - - else: - return data - - def _make_tscript_ptr(buffer, offset, length, dtype="int16"): return T.tvm_access_ptr( T.type_annotation(dtype=dtype), @@ -280,11 +261,10 @@ def qnn_conv2d(attrs, inputs, out_type): # Make a few checks to unpack the function arguments and ensure it was called with the right # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper. assert len(inputs) == 11 - assert attrs + assert not any(get_const_tuple(attrs.padding)) data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - padded_data = _optionally_pad_data(data, attrs.data_layout, attrs.padding) - _, height, width, in_channels = get_const_tuple(padded_data.shape) + _, height, width, in_channels = get_const_tuple(data.shape) out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) y_stride, x_stride = get_const_tuple(attrs.strides) @@ -358,9 +338,9 @@ def qnn_depthwise_conv2d(attrs, inputs, out_type): """ assert len(inputs) == 11 + assert not any(get_const_tuple(attrs.padding)) data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - padded_data = _optionally_pad_data(data, attrs.data_layout, attrs.padding) - _, _, height, width = get_const_tuple(padded_data.shape) + _, _, height, width = get_const_tuple(data.shape) _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) y_stride, x_stride = get_const_tuple(attrs.strides) @@ -492,12 +472,12 @@ def qnn_unrolled_depthwise_conv2d(attrs, inputs, out_type): """ assert len(inputs) == 11 + assert not any(get_const_tuple(attrs.padding)) y_stride, x_stride = get_const_tuple(attrs.strides) assert y_stride == x_stride == 1 data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] - padded_data = _optionally_pad_data(data, attrs.data_layout, attrs.padding) - _, _, height, width = get_const_tuple(padded_data.shape) + _, _, height, width = get_const_tuple(data.shape) _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) y_stride, x_stride = get_const_tuple(attrs.strides) diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index a6983440affc..3259c26b44f5 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -19,8 +19,8 @@ import numpy as np from tvm import nd, relay, target -from ..nn import * from ..utils import get_const_tuple +from ..nn import qnn_conv2d_alter_layout, qnn_add_alter_layout, qnn_requantize_alter_layout def prev_ops_match(curr_op, pattern): @@ -57,7 +57,7 @@ def _squash_transformations(expr): return prev_kernel.astype(attrs.dtype) elif kernel.op.name == "expand_dims": new_axes = 
range(attrs.axis, attrs.axis + attrs.num_newaxis) - return np.expand_dims(prev_kernel, tuple(axes)) + return np.expand_dims(prev_kernel, tuple(new_axes)) else: raise RuntimeError(f"Invalid kernel transformation '{expr}'!") @@ -107,34 +107,17 @@ def alter_conv2d_layout(attrs, inputs, _tinfos, _out_type): data_expr, kernel_expr = inputs[:2] is_depthwise = attrs.groups > 1 new_kernel_layout = "IOHW" if is_depthwise else "OHWI" - assert attrs.data_layout == "NHWC" - padding = get_const_tuple(attrs.padding) - - # For cortex-m devices, it does not make sense to do bounds checking for conv2d at - # runtime, and it is more efficient to copy the tensor to a new buffer. If we did - # this inside our conv2d, it - # We should handle padding separately from convolution, so the original tensor can be - # de-allocated immediately. This may also help with fusing padding onto a previous - # operator. - - if any(attrs.padding): - data_expr = relay.nn.pad(data_expr, ((0, 0), (0, 1), (0, 1), (0, 0)), -128) op = relay.qnn.op.conv2d( relay.cast(data_expr, dtype="int16"), relay.cast(kernel_expr, dtype="int16"), *inputs[2:], - **edit_attrs( - attrs, - padding=(0, 0, 0, 0), - kernel_layout=new_kernel_layout, - out_layout="NHWC" - ), + **edit_attrs(attrs, kernel_layout=new_kernel_layout, out_layout="NHWC"), ) # If possible, modify depthwise ops to take as input NCHW instead. - #if is_depthwise and prev_ops_match(op.args[0], ("cast", "qnn.requantize", "add", "qnn.conv2d")): - # op = _alter_depthwise_conv2d_layout(op) + if is_depthwise and prev_ops_match(op.args[0], ("cast", "qnn.requantize", "add", "qnn.conv2d")): + op = _alter_depthwise_conv2d_layout(op) return op @@ -187,11 +170,29 @@ def alter_add_layout(_attrs, inputs, _tinfos, _out_type): if new_biases.min() < -(2**31) or new_biases.max() > 2**31 - 1: return None + current_target = target.Target.current(allow_none=False) new_input_zp = relay.Constant(nd.array(np.int32(0))) - new_conv_args = (*prev_op.args[:2], new_input_zp, *prev_op.args[3:]) - new_conv_op = relay.qnn.op.conv2d(*new_conv_args, **prev_op.attrs) + new_conv_args = [*prev_op.args[:2], new_input_zp, *prev_op.args[3:]] bias_constant = relay.Constant(nd.array(new_biases.astype("int32"))) + # We should handle padding separately from convolution, so the original tensor can be + # de-allocated immediately. This may also help with fusing padding onto a previous + # operator. However, only do this if we're working with Cortex-M devices. + padding = get_const_tuple(prev_op.attrs.padding) + if "cortex-m" in current_target.mcpu and any(padding): + data_layout = prev_op.attrs.data_layout + assert data_layout.isupper() + + pad_up, pad_left, pad_down, pad_right = padding + pad_op_arg = [(0, 0)] * len(data_layout) + pad_op_arg[data_layout.index("H")] = (pad_up, pad_down) + pad_op_arg[data_layout.index("W")] = (pad_left, pad_right) + new_conv_args[0] = relay.nn.pad(new_conv_args[0], tuple(pad_op_arg), conv_input_zp) + + new_conv_op = relay.qnn.op.conv2d( + *new_conv_args, + **edit_attrs(prev_op.attrs, padding=(0, 0, 0, 0)), + ) # If biases was wrapped in an expand_dims op, we must re-wrap it if isinstance(biases_data_op, relay.expr.Call): new_biases_op = relay.expand_dims(bias_constant, **biases_data_op.attrs) diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py index 447b86f464fa..24fdbd3367e7 100644 --- a/python/tvm/topi/arm_cpu/qnn_legalize.py +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -29,11 +29,12 @@ as there is no downside. 
This may be possible with Relax, but I'm unsure. """ +import numpy as np from scipy.signal import convolve2d from tvm.topi.utils import get_const_tuple -from tvm import nd +from tvm import nd, relay from .qnn_alter_op import prev_ops_match, edit_attrs -from ..nn import * +from ..nn import qnn_bias_add_legalize def _compute_fixed_conv2d_outputs(requantize_op): @@ -82,7 +83,6 @@ def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): bias_data = bias_add_op.args[1].data.numpy() kernel_size = get_const_tuple(depthwise_op.attrs.kernel_size) - pad_top, pad_left, pad_bottom, pad_right = get_const_tuple(depthwise_op.attrs.padding) # Make a kernel_size x kernel_size array of fixed_input # Pad it with zeros usint padding diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py index 1f624f3b3466..1b583ae8069e 100644 --- a/python/tvm/topi/nn/qnn.py +++ b/python/tvm/topi/nn/qnn.py @@ -232,6 +232,28 @@ def qnn_bias_add_legalize(_attrs, _inputs, _tinfos): return None +@tvm.target.generic_func +def qnn_clip_legalize(_attrs, inputs, _tinfos, _out_type): + """Change clip layout. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. + """ + return inputs[0] + + @tvm.target.generic_func def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): """Change add layout. diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 523bb51edf71..b198659d3639 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -25,8 +25,8 @@ #include #include -#include #include +#include #include "../../arith/pattern_match.h" #include "codegen_params.h" @@ -632,8 +632,10 @@ void CodeGenC::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, } } +unsigned int random_seed = 0; void CodeGenC::VisitStmt_(const AllocateConstNode* op) { - int suffix = rand() % (2 << 24); + // Add a random suffix to eliminate duplicate global variables. 
+ int suffix = rand_r(&random_seed) % (2 << 24); std::string symbol_name = op->buffer_var->name_hint + "_" + std::to_string(suffix); int64_t num_elements = 1; const auto& data = op->data.value(); diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index fcd4f9db171d..0a6515e5b419 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -292,14 +292,8 @@ def _make_aot_model(params, hyperparams, layouts, is_depthwise=False): data, kernel, bias, output = tensors data_quant, kernel_quant, bias_quant, output_quant = quantizations - dtype, padding, _strides = hyperparams + dtype, _padding, _strides = hyperparams data_layout, _, output_layout = layouts - - if False: #any(padding): - pad_const = int(data_quant["zero_points"][0]) - pad_before = (0, padding[0], padding[1], 0) - pad_after = (0, padding[2], padding[3], 0) - data = np.pad(data, tuple(zip(pad_before, pad_after)), constant_values=pad_const) data_ndarr = _change_layout(data, "NHWC", data_layout, dtype) output_ndarr = _change_layout(output, "NHWC", output_layout, dtype) From 24c502900ca21e489336441cb0ec729bddba5631 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Thu, 26 Jan 2023 11:44:54 -0800 Subject: [PATCH 07/19] Address Alan's comments --- python/tvm/topi/arm_cpu/qnn.py | 25 +++++++++++-------- python/tvm/topi/arm_cpu/qnn_legalize.py | 25 +++++++++++++------ .../arm_cpu/test_quantized_convolution.py | 16 +++++------- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index c967fbecce7c..f0aaca266f40 100644 --- a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -32,12 +32,12 @@ from .mprofile.dsp.micro_kernel import tensordot -def int_ceil_division(x, y): +def _int_ceil_division(x, y): return -(x // -y) def _compute_output_dim(data_length, kernel_length, stride): - return int_ceil_division(data_length + 1 - kernel_length, stride) + return _int_ceil_division(data_length + 1 - kernel_length, stride) def _pick_num_outputs(out_width): @@ -273,8 +273,8 @@ def qnn_conv2d(attrs, inputs, out_type): # Decide how many sums our function should have running at the same time. Doing # this lets us do "more work" for each memory load, but doing too many of them causes us to run - # out of registers. Currently this is set to either 1 or 2, but autotuning this value would - # improve performance a lot. + # out of registers. Currently this is set to the smallest value greater than one that divides + # the output width, but autotuning this value would improve performance a lot. num_outputs = _pick_num_outputs(out_width) # Next, decide whether whether we need "parity alternation". 
For example, if we have an @@ -413,7 +413,7 @@ def output_ptr(output, y, c): else: raise TVMError(f"Unsupported out_layout '{output_layout}'!") - def _make_row_call(buffers, c_var, y, c): + def make_row_call(buffers, c_var, y, c): output, data, kernel, bias, scale = buffers return _make_tscript_call( function_names[(y + c) % 2, c % 2, 0], @@ -452,12 +452,15 @@ def biased_quantized_conv2d( with T.block("conv2ds"): T.block_attr({"pragma_import_c": function_code}) c = T.axis.remap("S", [c_ax]) * 2 - _make_row_call((output, data, kernel, bias, scale), c, 0, 0) - _make_row_call((output, data, kernel, bias, scale), c, 1, 0) - _make_row_call((output, data, kernel, bias, scale), c, 2, 0) - _make_row_call((output, data, kernel, bias, scale), c, 0, 1) - _make_row_call((output, data, kernel, bias, scale), c, 1, 1) - _make_row_call((output, data, kernel, bias, scale), c, 2, 1) + + # TODO how can I programatically make the right number of + # function calls? + make_row_call((output, data, kernel, bias, scale), c, 0, 0) + make_row_call((output, data, kernel, bias, scale), c, 1, 0) + make_row_call((output, data, kernel, bias, scale), c, 2, 0) + make_row_call((output, data, kernel, bias, scale), c, 0, 1) + make_row_call((output, data, kernel, bias, scale), c, 1, 1) + make_row_call((output, data, kernel, bias, scale), c, 2, 1) return biased_quantized_conv2d diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py index 24fdbd3367e7..9e1298ab7bf6 100644 --- a/python/tvm/topi/arm_cpu/qnn_legalize.py +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -67,7 +67,14 @@ def _compute_fixed_conv2d_outputs(requantize_op): def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): - """Compute all depthwise conv2d output values that do not depend on the PREVIOUS layer input.""" + """Compute all depthwise conv2d output values that do not depend on the PREVIOUS layer input. + + We take as input a requantize operator, and a dictionary of which inputs to our depthwise + operator are fixed and what values they are fixed to. However, a fixed input to one channel + of our depthwise operator does NOT guarantee we can remove the output, because of padding. + This function checks if the padding makes a difference in the outputs, and if not, removes + the channels from the depthwise_conv2d. + """ bias_add_op = requantize_op.args[0] depthwise_op = bias_add_op.args[0] @@ -99,16 +106,18 @@ def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): rounded = np.around((convolved + bias_data[i]) * scale).astype("int32") clipped = np.clip(rounded + rq_output_zero_point, -128, 127) - # TODO - this is kinda paranoid and I think it removes too many. Make sure this is accurate. - # This bug likely causes a ~5% performance reduction + # We require the ENTIRE padded convolution to all have the same clipped value before we do + # a replacement. This is excessive - we only have to check for the padding that will + # actually be performed on the depthwise convolution, which is often less. If we felt even + # more ambitious, we could do the replacement for "close enough" looking convolution + # outputs, which in theory could reduce accuracy but in practice does not. Doing this would + # yield a ~0.5% speed gain on MobileNetV1, and nothing on other models. + if np.all(clipped == clipped[0, 0]): fixed_outputs[i] = clipped[0, 0] - else: - # TODO remove - assert True - # TODO look for all-zero entries in the depthwise kernel. 
I don't think these - # really occur in practice, but it would be nice for theoretical completeness. + # TODO look for all-zero entries in the depthwise kernel. I don't think these really occur in + # practice, but it would be nice for theoretical completeness. return fixed_outputs diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index 0a6515e5b419..4883ccd2392f 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -96,14 +96,14 @@ def _get_mobilenet_v1_layer_attributes(layer_num): @pytest.mark.parametrize("layer", range(2, 27, 2)) -def test_infinite_bias_detection(interpreter, layer): +def test_empty_channel_detection(interpreter, layer): """Some models (mainly MobileNetV1) have kernels with many output channels full entirely of - zeroes. The VWW mdoel is one of these. This test confirms that the outputs of these channels, + zeroes. The VWW model is one of these. This test confirms that the outputs of these channels, as computed by TensorFlow, are indeed not dependent upon the input values. """ _, kernel, bias, output = _load_tflite_layer(interpreter, layer) - kernel_data, kernel_quant = kernel + kernel_data, _ = kernel bias_data, bias_quant = bias output_data, output_quant = output is_depthwise = _get_mobilenet_v1_layer_attributes(layer)[2] @@ -128,21 +128,17 @@ def test_infinite_bias_detection(interpreter, layer): out_channel_values = output_data[0, :, :, i].flatten() assert all(x == clipped for x in out_channel_values) fixed_channels[i] = clipped - print(f"Layer {layer} had {len(fixed_channels)}/{out_channels} empty!") # We now need to compute values for the following depthwise layer if layer == 26: return - _, kernel, bias, output = _load_tflite_layer(interpreter, layer + 1) - kernel_data, kernel_quant = kernel - bias_data, bias_quant = bias - output_data, output_quant = output + depthwise_output = _load_tflite_layer(interpreter, layer + 1)[3][0] is_depthwise = _get_mobilenet_v1_layer_attributes(layer + 1)[2] assert is_depthwise - for i, value in fixed_channels.items(): - assert np.all(output_data[:, :, :, i] == output_data[0, 0, 0, i]) + for i in fixed_channels: + assert np.all(depthwise_output[:, :, :, i] == depthwise_output[0, 0, 0, i]) def _get_relu_activation_prefix(layer_num): From d9a3659dccbab4f401725fe1f1926e03e3d14e64 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Mon, 30 Jan 2023 20:48:58 -0800 Subject: [PATCH 08/19] Get tests passing via small performance sacrifice Get tests passing with small performance sacrifice Fix linting and axis_semantic_change --- include/tvm/relay/transform.h | 7 ++++++ src/relay/backend/utils.cc | 4 ++-- src/relay/transforms/simplify_expr.cc | 22 +++++++++++++++++++ .../python/test_topi_conv2d_tensordot_opts.py | 4 ++-- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 43a0f89d95c1..5f591f1d89ad 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -467,6 +467,13 @@ TVM_DLL Pass RemoveUnusedFunctions(Array entry_functions); */ TVM_DLL Pass SimplifyExpr(); +/*! + * \brief Stripped down version of SimplifyExpr which is run after AlterOpLayout. + * + * \return The pass. + */ +TVM_DLL Pass SimplifyExprPostAlterOp(); + /*! * \brief Run any custom passes registered under "RelayToTIR" attributes on TargetKinds. 
* diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index 8421721b2777..f009bda9cd98 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -274,10 +274,10 @@ Array GetPassPrefix(bool is_homogeneous, bool is_vm) { pass_seqs.push_back(transform::InferType()); } pass_seqs.push_back(transform::AlterOpLayout()); + pass_seqs.push_back(transform::SimplifyExprPostAlterOp()); } // Fast math optimizations. - pass_seqs.push_back(transform::SimplifyExpr()); pass_seqs.push_back(transform::FastMath()); pass_seqs.push_back(transform::FoldConstant()); @@ -417,7 +417,7 @@ Optional DefaultTIRConverterImpl(const Array& args, return NullOpt; } } - PrimFunc func = te::CreatePrimFuncWithConstants(args, constants, DataType::Int(32)); + PrimFunc func = te::CreatePrimFuncWithConstants(args, constants, DataType::Int(64)); bool dynamic_loop_extent = false; tir::PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void { if (const auto* loop = obj.as()) { diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index c64957b5b62a..3e912a204eda 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -969,6 +969,19 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) { return RewritePatterns(composer.MakeCallbacks(), expr, mod); } +Expr SimplifyExprPostAlterOp(const Expr& expr, const IRModule& mod) { + // stripped-down version of AlterOp that cleans up some patterns + // often left by the AlterOpLayout pass. + DFPatternRewriteComposer composer; + composer.AddRewrite(); + composer.AddRewrite(); + composer.AddRewrite(); + composer.AddRewrite(); + composer.AddRewrite(); + composer.AddRewrite(); + return RewritePatterns(composer.MakeCallbacks(), expr, mod); +} + namespace transform { Pass SimplifyExpr() { @@ -979,7 +992,16 @@ Pass SimplifyExpr() { return CreateFunctionPass(pass_func, 0, "SimplifyExpr", {"InferType"}); } +Pass SimplifyExprPostAlterOp() { + runtime::TypedPackedFunc pass_func = + [=](Function f, IRModule m, PassContext pc) { + return Downcast(SimplifyExprPostAlterOp(f, m)); + }; + return CreateFunctionPass(pass_func, 0, "SimplifyExprPostAlterOp", {"InferType"}); +} + TVM_REGISTER_GLOBAL("relay._transform.SimplifyExpr").set_body_typed(SimplifyExpr); +// Don't globally register SimplifyExprPostAlterOp } // namespace transform diff --git a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py index fc0b556e9d10..7bea7577b6bf 100644 --- a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py +++ b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py @@ -309,9 +309,9 @@ def test_1x1x8_convolution_code(): requant_3 = __ssat(requant_3 + -128, 8); int packed_res_0; - asm inline ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_0) : "r" (requant_0), "r" (requant_1)); + __asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_0) : "r" (requant_0), "r" (requant_1)); int packed_res_1; - asm inline ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_1) : "r" (requant_2), "r" (requant_3)); + __asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_1) : "r" (requant_2), "r" (requant_3)); output[0] = packed_res_0; output[1] = packed_res_1; return 0; From 85344cffcf4e2b61b03e63a6a84e3586e0bc5fbd Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 1 Feb 2023 08:36:53 -0800 Subject: [PATCH 09/19] Only run empty channel detection test with tensorflow --- 
.../python/relay/strategy/arm_cpu/test_quantized_convolution.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index 4883ccd2392f..bbf7003e61aa 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -96,6 +96,7 @@ def _get_mobilenet_v1_layer_attributes(layer_num): @pytest.mark.parametrize("layer", range(2, 27, 2)) +@tvm.testing.requires_package("tensorflow") def test_empty_channel_detection(interpreter, layer): """Some models (mainly MobileNetV1) have kernels with many output channels full entirely of zeroes. The VWW model is one of these. This test confirms that the outputs of these channels, From c5507d3c20ea971ca7315c0e7792aa49347cd9a7 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Fri, 3 Feb 2023 11:51:29 -0800 Subject: [PATCH 10/19] Use good strategy for name de-duplication --- src/target/source/codegen_c.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index b198659d3639..7553e0aa1570 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -23,9 +23,9 @@ #include "codegen_c.h" #include +#include #include -#include #include #include "../../arith/pattern_match.h" @@ -632,11 +632,10 @@ void CodeGenC::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, } } -unsigned int random_seed = 0; +NameSupply global_name_supply = NameSupply(""); void CodeGenC::VisitStmt_(const AllocateConstNode* op) { - // Add a random suffix to eliminate duplicate global variables. - int suffix = rand_r(&random_seed) % (2 << 24); - std::string symbol_name = op->buffer_var->name_hint + "_" + std::to_string(suffix); + std::string symbol_name = global_name_supply->FreshName(op->buffer_var->name_hint); + int64_t num_elements = 1; const auto& data = op->data.value(); From 3b33ae389a604b5d50343d38a4f577b7c89f2f25 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Thu, 9 Feb 2023 21:53:49 -0800 Subject: [PATCH 11/19] Address first round of code review comments --- python/tvm/topi/arm_cpu/qnn.py | 113 +++++-- python/tvm/topi/arm_cpu/qnn_alter_op.py | 10 +- python/tvm/topi/arm_cpu/qnn_legalize.py | 13 +- src/relay/transforms/simplify_expr.cc | 3 +- .../relay/qnn/test_qnn_channel_stripping.py | 284 ++++++++++++++++++ .../arm_cpu/test_quantized_convolution.py | 6 +- 6 files changed, 391 insertions(+), 38 deletions(-) create mode 100644 tests/python/relay/qnn/test_qnn_channel_stripping.py diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py index f0aaca266f40..bfd37847f3e0 100644 --- a/python/tvm/topi/arm_cpu/qnn.py +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -24,7 +24,7 @@ from typing import Callable, Dict, Tuple import tvm -from tvm import te, TVMError +from tvm import te, tir, TVMError from tvm.script import tir as T from tvm.tir import const @@ -147,12 +147,44 @@ def _make_tscript_call(func_name, *args): def _make_conv2d_primfunc( output_dimensions: Tuple[int, int, int, int], - buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple], + buffer_shapes: Tuple, aligned_func: Tuple[str, str], offset_func: Tuple[str, str], ptr_gens: Tuple[Callable, Callable], - output_layout="NHWC", -): + output_layout: str = "NHWC", +) -> tir.function.PrimFunc: + """Makes a TIR PrimFunc computing Conv2D using a call to tensordot. 
+
+    Can be used to generate regular, depthwise, and grouped Conv2D operators by passing different
+    arguments and ptr_gen functions. However, it only works for Conv2D operators where the height
+    stride of the tensor is divisible by two.
+
+    Parameters
+    ----------
+    output_dimensions : Tuple[int, int, int, int]
+        A tuple containing the out_height, out_width, out_channels, and desired num_outputs values
+        in that order.
+
+    buffer_shapes: Tuple[tvm.ir.container.Array]
+        The shapes of the data, kernel, bias, scale, and output tensors, in that order. Each shape
+        should be a TVM Array.
+
+    aligned_func: Tuple[str, str]
+        A tuple containing the (name, C implementation) of a word-aligned tensordot operator.
+
+    offset_func: Tuple[str, str]
+        A tuple containing the (name, C implementation) of a word-unaligned tensordot operator. Can
+        be a tuple of empty strings if the Conv2D in question does not need an unaligned operator.
+
+    ptr_gens: Tuple[Callable, Callable]
+        A tuple of two functions to generate data and kernel access pointers. They should take as
+        inputs the buffer, (y, x, c) indices, and an alignment offset. They should return a
+        T.tvm_access_ptr object which can be used in T.call_extern.
+
+    output_layout: str
+        The tensor layout that will be produced by the generated PrimFunc. Should be NHWC or NCHW.
+    """
+
     out_height, out_width, out_channels, num_outputs = output_dimensions
     data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes
     aligned_func_name, aligned_func_code = aligned_func
@@ -397,8 +429,46 @@ def _make_unrolled_conv2d_primfunc(
     function_names: Dict[Tuple, str],
     function_code: str,
     ptr_gens: Tuple[Callable, Callable],
-    output_layout="NHWC",
-):
+    output_layout: str = "NHWC",
+) -> tir.function.PrimFunc:
+    """Makes a TIR PrimFunc computing Conv2D using a call to tensordot.
+
+    Can be used to generate regular, depthwise, and grouped Conv2D operators by passing different
+    arguments and ptr_gen functions. Takes some of the same arguments as _make_conv2d_primfunc, but
+    requires the tensordot function variations to be passed differently. The generated PrimFunc is
+    similar to the one produced by _make_conv2d_primfunc, but unrolls the height and width loops
+    over the input tensor. This results in longer code, but unlike _make_conv2d_primfunc this
+    function does not require the height stride to be an even number of words.
+
+    This is required to compute layer 25 in MobileNetV1 models, among other things.
+
+    Parameters
+    ----------
+    output_dimensions : Tuple[int, int, int]
+        A tuple containing the out_height, out_width, and out_channels values in that order.
+
+    buffer_shapes: Tuple[tvm.ir.container.Array]
+        The shapes of the data, kernel, bias, scale, and output tensors, in that order. Each shape
+        should be a TVM Array.
+
+    function_names: Dict[Tuple, str]
+        A dictionary mapping a tuple of (data, kernel, output) alignments to the name of the
+        appropriate tensordot function.
+
+    function_code: str
+        A string containing all versions of the tensordot function our PrimFunc needs. This will
+        usually be a string of 4+ function variations concatenated together.
+
+    ptr_gens: Tuple[Callable, Callable]
+        A tuple of two functions to generate data and kernel access pointers. They should take as
+        inputs the buffer, (y, x, c) indices, and an alignment offset. They should return a
+        T.tvm_access_ptr object which can be used in T.call_extern.
+
+    output_layout: str
+        The tensor layout that will be produced by the generated PrimFunc. 
Should be NHWC or NCHW. + """ + out_height, out_width, out_channels = output_dimensions data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes data_ptr, kernel_ptr = ptr_gens @@ -413,16 +483,18 @@ def output_ptr(output, y, c): else: raise TVMError(f"Unsupported out_layout '{output_layout}'!") - def make_row_call(buffers, c_var, y, c): + def make_row_calls(buffers, c_var, out_height): output, data, kernel, bias, scale = buffers - return _make_tscript_call( - function_names[(y + c) % 2, c % 2, 0], - output_ptr(output, y, c_var + c), - data_ptr(data, y, c_var + c, offset=(y + c) % 2), - kernel_ptr(kernel, c_var + c, offset=c), - _bias_ptr(bias, c_var + c), - _scale_ptr(scale, c_var + c), - ) + for y in range(out_height): + for c in range(2): + _make_tscript_call( + function_names[(y + c) % 2, c % 2, 0], + output_ptr(output, y, c_var + c), + data_ptr(data, y, c_var + c, offset=(y + c) % 2), + kernel_ptr(kernel, c_var + c, offset=c), + _bias_ptr(bias, c_var + c), + _scale_ptr(scale, c_var + c), + ) @T.prim_func def biased_quantized_conv2d( @@ -452,15 +524,7 @@ def biased_quantized_conv2d( with T.block("conv2ds"): T.block_attr({"pragma_import_c": function_code}) c = T.axis.remap("S", [c_ax]) * 2 - - # TODO how can I programatically make the right number of - # function calls? - make_row_call((output, data, kernel, bias, scale), c, 0, 0) - make_row_call((output, data, kernel, bias, scale), c, 1, 0) - make_row_call((output, data, kernel, bias, scale), c, 2, 0) - make_row_call((output, data, kernel, bias, scale), c, 0, 1) - make_row_call((output, data, kernel, bias, scale), c, 1, 1) - make_row_call((output, data, kernel, bias, scale), c, 2, 1) + make_row_calls((output, data, kernel, bias, scale), c, out_height) return biased_quantized_conv2d @@ -494,7 +558,6 @@ def qnn_unrolled_depthwise_conv2d(attrs, inputs, out_type): dimensions = (width, kernel_h, kernel_w) x_strides = (1, out_channels) - # data, kernel func_names = {} impls = [] for alignment in ((0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0)): diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index 3259c26b44f5..f4fc01726af2 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -16,6 +16,8 @@ # under the License. """Arm Cortex-M specific optimizations for quantized operators.""" +from typing import Iterable + import numpy as np from tvm import nd, relay, target @@ -23,7 +25,13 @@ from ..nn import qnn_conv2d_alter_layout, qnn_add_alter_layout, qnn_requantize_alter_layout -def prev_ops_match(curr_op, pattern): +def prev_ops_match(curr_op: relay.expr.Call, pattern: Iterable[str]): + """Checks if the names of nested Relay operators match a pattern. + + Note this function considers `curr_op` as a linear stack of operators, only considering args[0] + when traversing backwards. `pattern` should be an Iterable of operator names, written backwards + from last to first. 
+ """ prev_op = curr_op for op_name in pattern: if (not hasattr(prev_op, "op")) or prev_op.op.name != op_name: diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py index 9e1298ab7bf6..caa92d8c3c22 100644 --- a/python/tvm/topi/arm_cpu/qnn_legalize.py +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -90,11 +90,6 @@ def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): bias_data = bias_add_op.args[1].data.numpy() kernel_size = get_const_tuple(depthwise_op.attrs.kernel_size) - - # Make a kernel_size x kernel_size array of fixed_input - # Pad it with zeros usint padding - # Do a convolution and make sure - fixed_outputs = {} for i, fixed_input in fixed_channel_inputs.items(): @@ -116,8 +111,8 @@ def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): if np.all(clipped == clipped[0, 0]): fixed_outputs[i] = clipped[0, 0] - # TODO look for all-zero entries in the depthwise kernel. I don't think these really occur in - # practice, but it would be nice for theoretical completeness. + # TODO @guberti look for all-zero entries in the depthwise kernel. I don't think these really + # occur in practice, but it would be nice for theoretical completeness. return fixed_outputs @@ -299,8 +294,8 @@ def _densify_conv_pool_dense_pattern(attrs, inputs): def legalize_bias_add(attrs, inputs, _tinfos): """Remove empty convolution channels when possible, and "fold" them into the bias add. - TODO these rewrites are always beneficial and will improve performance cross-platform - should - we enable them for all platforms, not just arm_cpu? + TODO @guberti: these rewrites are always beneficial and will improve performance cross-platform, + should we enable them for all platforms, not just arm_cpu? """ if prev_ops_match( diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index 3e912a204eda..a9b7390c0374 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -1001,7 +1001,8 @@ Pass SimplifyExprPostAlterOp() { } TVM_REGISTER_GLOBAL("relay._transform.SimplifyExpr").set_body_typed(SimplifyExpr); -// Don't globally register SimplifyExprPostAlterOp +TVM_REGISTER_GLOBAL("relay._transform.SimplifyExprPostAlterOp") + .set_body_typed(SimplifyExprPostAlterOp); } // namespace transform diff --git a/tests/python/relay/qnn/test_qnn_channel_stripping.py b/tests/python/relay/qnn/test_qnn_channel_stripping.py new file mode 100644 index 000000000000..471e0ba39bdd --- /dev/null +++ b/tests/python/relay/qnn/test_qnn_channel_stripping.py @@ -0,0 +1,284 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
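The rewrite this new test file exercises rests on one invariant: if every weight in an output channel of a conv2d kernel is zero, that channel's value before requantization is just its bias term and never depends on the layer input. A short NumPy sketch of the invariant (the shapes, zero point, and numbers below are illustrative assumptions, not values taken from any real model):

    import numpy as np

    data = np.random.randint(-128, 128, size=(1, 12, 12, 4)).astype("int32")
    kernel = np.zeros((1, 1, 4, 4), dtype="int32")
    kernel[..., 1] = [1, 2, 3, 4]  # only output channel 1 actually reads the input
    bias = np.array([198, -2, 19, 10], dtype="int32")

    # A 1x1 NHWC convolution with an input zero point of -128, plus the bias.
    conv = np.einsum("nhwi,xyio->nhwo", data + 128, kernel) + bias

    # Channels 0, 2, and 3 have all-zero kernels, so each collapses to a single
    # constant value (its bias), which is what the legalize pass folds away.
    for channel in range(4):
        assert channel == 1 or np.unique(conv[..., channel]).size == 1
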
+"""Test QNN channel stripping legalization pass.""" + +import numpy as np +import tvm +from tvm import nd, relay + +from tvm.relay import transform +from tvm.relay.testing.temp_op_attr import TempOpAttr + +from tvm.topi.arm_cpu.qnn_legalize import legalize_bias_add + + +def run_opt_pass(expr, passes): + passes = passes if isinstance(passes, list) else [passes] + mod = tvm.IRModule.from_expr(expr) + seq = tvm.transform.Sequential(passes) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + entry = mod["main"] + return entry if isinstance(expr, relay.Function) else entry.body + + +def tvm_const(obj): + return relay.Constant(nd.array(obj)) + + +def make_test_conv_depthwise_conv(): + input_var = relay.var("x", shape=(1, 12, 12, 4), dtype="int8") + + kernel_1 = np.array( + [[0, 1, 0, -2], [0, 3, 0, 5], [0, 5, 0, -9], [0, 2, 0, 21]], dtype="int8" + ).reshape((1, 1, 4, 4)) + input_scale_1 = np.float32(0.5) + output_scale_1 = np.array([0.5, 2.0, 0.25, 4.0], dtype="float32") + + out = relay.qnn.op.conv2d( + input_var, + tvm_const(kernel_1), + tvm_const(np.int32(-128)), + tvm_const(np.int32(0)), + tvm_const(input_scale_1), + tvm_const(output_scale_1), + channels=4, + kernel_size=(1, 1), + padding=(0, 0), + data_layout="NHWC", + kernel_layout="HWIO", + ) + + bias_1 = np.array([198, -2, 19, 10], dtype="int32") + out = relay.nn.bias_add( + out, + tvm_const(bias_1), + axis=3, + ) + + input_scale_2 = np.float32(0.25) + out = relay.qnn.op.requantize( + out, + tvm_const(input_scale_1 * output_scale_1), + tvm_const(np.int32(0)), + tvm_const(input_scale_2), + tvm_const(np.int32(-128)), + axis=3, + out_dtype="int8", + ) + # Outputs here will be fixed to {0: 70, 2: -118} + + kernel_2 = np.array( + [ + [0, 6, 4, 2], + [8, 6, -3, -1], + [-2, -5, 3, -8], + [-7, 5, 1, 9], + [-4, -9, -8, -2], + [-1, 4, -5, 3], + [-4, -9, 2, 6], + [9, -6, 0, 5], + [-3, 8, 1, -7], + ], + dtype="int8", + ).reshape((3, 3, 4, 1)) + output_scale_2 = np.array([0.25, 0.125, 2.0, 0.125], dtype="float32") + out = relay.qnn.op.conv2d( + out, + tvm_const(kernel_2), + tvm_const(np.int32(-128)), + tvm_const(np.int32(0)), + tvm_const(input_scale_2), + tvm_const(output_scale_2), + channels=4, + groups=4, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWOI", + ) + + bias_2 = np.array([4582, 4, -12, 15], dtype="int32") + out = relay.nn.bias_add( + out, + tvm_const(bias_2), + axis=3, + ) + + input_scale_3 = np.float32(0.125) + out = relay.qnn.op.requantize( + out, + tvm_const(input_scale_2 * output_scale_2), + tvm_const(np.int32(0)), + tvm_const(input_scale_3), + tvm_const(np.int32(-128)), + axis=3, + out_dtype="int8", + ) + # Outputs here will be fixed to {0: 127, 2: -128} + + kernel_3 = np.array( + [[4, -2, 9, 9], [0, 0, 0, 0], [0, 0, 0, 0], [-1, 1, -1, 1]], dtype="int8" + ).reshape((1, 1, 4, 4)) + output_scale_3 = np.array([0.25, 0.125, 1.0, 0.5], dtype="float32") + + out = relay.qnn.op.conv2d( + out, + tvm_const(kernel_3), + tvm_const(np.int32(-128)), + tvm_const(np.int32(0)), + tvm_const(input_scale_3), + tvm_const(output_scale_3), + channels=4, + kernel_size=(1, 1), + padding=(0, 0), + data_layout="NHWC", + kernel_layout="HWIO", + ) + + bias_3 = np.array([1, -1, 4, 6], dtype="int32") + out = relay.nn.bias_add( + out, + tvm_const(bias_3), + axis=3, + ) + + return relay.Function([input_var], out) + + +def make_expected_conv_depthwise_conv(): + input_var = relay.var("x", shape=(1, 12, 12, 4), dtype="int8") + + kernel_1 = np.array([[1, -2], [3, 5], [5, -9], [2, 21]], dtype="int8").reshape((1, 1, 4, 2)) 
+ input_scale_1 = np.float32(0.5) + output_scale_1 = np.array([2.0, 4.0], dtype="float32") + + out = relay.qnn.op.conv2d( + input_var, + tvm_const(kernel_1), + tvm_const(np.int32(-128)), + tvm_const(np.int32(0)), + tvm_const(input_scale_1), + tvm_const(output_scale_1), + channels=2, + kernel_size=(1, 1), + padding=(0, 0), + data_layout="NHWC", + kernel_layout="HWIO", + ) + + bias_1 = np.array([-2, 10], dtype="int32") + out = relay.nn.bias_add( + out, + tvm_const(bias_1), + axis=3, + ) + + input_scale_2 = np.float32(0.25) + out = relay.qnn.op.requantize( + out, + tvm_const(input_scale_1 * output_scale_1), + tvm_const(np.int32(0)), + tvm_const(input_scale_2), + tvm_const(np.int32(-128)), + axis=3, + out_dtype="int8", + ) + # Outputs here will be fixed to {0: 70, 2: -118} + + kernel_2 = np.array( + [ + [6, 2], + [6, -1], + [-5, -8], + [5, 9], + [-9, -2], + [4, 3], + [-9, 6], + [-6, 5], + [8, -7], + ], + dtype="int8", + ).reshape((3, 3, 2, 1)) + output_scale_2 = np.array([0.125, 0.125], dtype="float32") + out = relay.qnn.op.conv2d( + out, + tvm_const(kernel_2), + tvm_const(np.int32(-128)), + tvm_const(np.int32(0)), + tvm_const(input_scale_2), + tvm_const(output_scale_2), + channels=2, + groups=2, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWOI", + ) + + bias_2 = np.array([4, 15], dtype="int32") + out = relay.nn.bias_add( + out, + tvm_const(bias_2), + axis=3, + ) + + input_scale_3 = np.float32(0.125) + out = relay.qnn.op.requantize( + out, + tvm_const(input_scale_2 * output_scale_2), + tvm_const(np.int32(0)), + tvm_const(input_scale_3), + tvm_const(np.int32(-128)), + axis=3, + out_dtype="int8", + ) + # Outputs here will be fixed to {0: 127, 2: -128} + + kernel_3 = np.array([[0, 0, 0, 0], [-1, 1, -1, 1]], dtype="int8").reshape((1, 1, 2, 4)) + output_scale_3 = np.array([0.25, 0.125, 1.0, 0.5], dtype="float32") + + out = relay.qnn.op.conv2d( + out, + tvm_const(kernel_3), + tvm_const(np.int32(-128)), + tvm_const(np.int32(0)), + tvm_const(input_scale_3), + tvm_const(output_scale_3), + channels=4, + kernel_size=(1, 1), + padding=(0, 0), + data_layout="NHWC", + kernel_layout="HWIO", + ) + + bias_3 = np.array([1, -1, 4, 6], dtype="int32") + out = relay.nn.bias_add( + out, + tvm_const(bias_3), + axis=3, + ) + + return relay.Function([input_var], out) + + +def test_cdc_channel_stripping_matches_expected(): + with TempOpAttr("nn.bias_add", "FTVMLegalize", legalize_bias_add): + before = make_test_conv_depthwise_conv() + actual = run_opt_pass(before, transform.Legalize()) + expected = run_opt_pass(make_expected_conv_depthwise_conv(), transform.InferType()) + tvm.ir.assert_structural_equal(actual, expected) diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py index bbf7003e61aa..8af49ca08f7f 100644 --- a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -43,6 +43,7 @@ SAMPLE_URL = ( "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg" ) +MODEL_NUM_CONVS = 27 @pytest.fixture(scope="module") @@ -130,10 +131,11 @@ def test_empty_channel_detection(interpreter, layer): assert all(x == clipped for x in out_channel_values) fixed_channels[i] = clipped - # We now need to compute values for the following depthwise layer - if layer == 26: + # Check if we are on the final convolution and skip the next test if so + if layer + 1 >= MODEL_NUM_CONVS: return + # We now 
need to compute values for the following depthwise layer depthwise_output = _load_tflite_layer(interpreter, layer + 1)[3][0] is_depthwise = _get_mobilenet_v1_layer_attributes(layer + 1)[2] assert is_depthwise From c69a74245f65022e703c2c3f4b297cf9d365d598 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Sat, 11 Feb 2023 12:55:11 -0800 Subject: [PATCH 12/19] InferTypeHack --- src/relay/qnn/op/dense.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 48f2a813d0e7..b5ba33051447 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -242,7 +242,7 @@ RELAY_REGISTER_OP("qnn.dense") "The quantization zero_point of the weight tensor.") .set_support_level(11) .add_type_rel("QDense", QnnDenseRel) - .set_attr("FInferCorrectLayout", QnnDenseInferCorrectLayout) +// .set_attr("FInferCorrectLayout", QnnDenseInferCorrectLayout) .set_attr("TNonComputational", true) .set_attr("FTVMQnnCanonicalize", QnnDenseCanonicalize) .set_attr("TOpPattern", kOutEWiseFusable); From ea8c9f49467a52ca338fc7c7926cb762bbbd88ee Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Sun, 12 Feb 2023 14:12:05 -0800 Subject: [PATCH 13/19] Slightly flaky channel stripping test --- .../relay/qnn/test_qnn_channel_stripping.py | 173 ++++++++++-------- 1 file changed, 94 insertions(+), 79 deletions(-) diff --git a/tests/python/relay/qnn/test_qnn_channel_stripping.py b/tests/python/relay/qnn/test_qnn_channel_stripping.py index 471e0ba39bdd..f5f56fa44ead 100644 --- a/tests/python/relay/qnn/test_qnn_channel_stripping.py +++ b/tests/python/relay/qnn/test_qnn_channel_stripping.py @@ -22,6 +22,7 @@ from tvm.relay import transform from tvm.relay.testing.temp_op_attr import TempOpAttr +from tvm.testing.aot import generate_ref_data from tvm.topi.arm_cpu.qnn_legalize import legalize_bias_add @@ -36,11 +37,19 @@ def run_opt_pass(expr, passes): return entry if isinstance(expr, relay.Function) else entry.body +def execute_relay_func(relay_func, in_data): + ref_module = tvm.IRModule.from_expr(relay_func) + return generate_ref_data(ref_module, {"input": in_data})["output"] + + def tvm_const(obj): return relay.Constant(nd.array(obj)) def make_test_conv_depthwise_conv(): + """Generates a convolution -> depthwise_convolution -> convolution pattern that can have + channels stripped. The structure here mirrors MobileNetV1's layers 8-10.""" + input_var = relay.var("x", shape=(1, 12, 12, 4), dtype="int8") kernel_1 = np.array( @@ -160,125 +169,131 @@ def make_test_conv_depthwise_conv(): return relay.Function([input_var], out) -def make_expected_conv_depthwise_conv(): - input_var = relay.var("x", shape=(1, 12, 12, 4), dtype="int8") +def make_test_conv_pool_dense(): + """Generates a convolution -> pool -> dense pattern that can have channels stripped. 
The + structure here mirrors MobileNetV1's final few layers.""" - kernel_1 = np.array([[1, -2], [3, 5], [5, -9], [2, 21]], dtype="int8").reshape((1, 1, 4, 2)) - input_scale_1 = np.float32(0.5) - output_scale_1 = np.array([2.0, 4.0], dtype="float32") + input_var = relay.var("x", shape=(1, 3, 3, 4), dtype="int8") + + kernel = np.array( + [[0, 1, 0, -2], [0, 3, 0, 5], [0, 5, 0, -9], [0, 2, 0, 21]], dtype="int8" + ).reshape((1, 1, 4, 4)) + input_scale = np.float32(0.029626124) + output_scale = np.array([0.5, 2.0, 0.25, 4.0], dtype="float32") out = relay.qnn.op.conv2d( input_var, - tvm_const(kernel_1), + tvm_const(kernel), tvm_const(np.int32(-128)), tvm_const(np.int32(0)), - tvm_const(input_scale_1), - tvm_const(output_scale_1), - channels=2, + tvm_const(input_scale), + tvm_const(output_scale), + channels=4, kernel_size=(1, 1), padding=(0, 0), data_layout="NHWC", kernel_layout="HWIO", ) - bias_1 = np.array([-2, 10], dtype="int32") + bias_1 = np.array([198, -2, 19, 10], dtype="int32") out = relay.nn.bias_add( out, tvm_const(bias_1), axis=3, ) - input_scale_2 = np.float32(0.25) out = relay.qnn.op.requantize( out, - tvm_const(input_scale_1 * output_scale_1), + tvm_const(input_scale * output_scale), tvm_const(np.int32(0)), - tvm_const(input_scale_2), + tvm_const(np.float32(0.015656913)), tvm_const(np.int32(-128)), axis=3, out_dtype="int8", ) - # Outputs here will be fixed to {0: 70, 2: -118} - kernel_2 = np.array( - [ - [6, 2], - [6, -1], - [-5, -8], - [5, 9], - [-9, -2], - [4, 3], - [-9, 6], - [-6, 5], - [8, -7], - ], - dtype="int8", - ).reshape((3, 3, 2, 1)) - output_scale_2 = np.array([0.125, 0.125], dtype="float32") - out = relay.qnn.op.conv2d( + out = relay.cast(out, dtype="int32") + out = relay.nn.avg_pool2d( out, - tvm_const(kernel_2), - tvm_const(np.int32(-128)), - tvm_const(np.int32(0)), - tvm_const(input_scale_2), - tvm_const(output_scale_2), - channels=2, - groups=2, - kernel_size=(3, 3), - padding=(1, 1), - data_layout="NHWC", - kernel_layout="HWOI", + pool_size=[3, 3], + strides=[3, 3], + layout="NHWC", ) - bias_2 = np.array([4, 15], dtype="int32") - out = relay.nn.bias_add( - out, - tvm_const(bias_2), - axis=3, - ) + out = relay.cast(out, dtype="int8") + # The channel stripping logic expects two reshape operators + out = relay.reshape(out, newshape=[-1, 4]) + out = relay.reshape(out, newshape=[-1, 4]) - input_scale_3 = np.float32(0.125) - out = relay.qnn.op.requantize( + dense_weights = np.array([[15, -2, -3, 11], [12, -10, 13, -10]], dtype="int8") + out = relay.qnn.op.dense( out, - tvm_const(input_scale_2 * output_scale_2), - tvm_const(np.int32(0)), - tvm_const(input_scale_3), - tvm_const(np.int32(-128)), - axis=3, - out_dtype="int8", - ) - # Outputs here will be fixed to {0: 127, 2: -128} - - kernel_3 = np.array([[0, 0, 0, 0], [-1, 1, -1, 1]], dtype="int8").reshape((1, 1, 2, 4)) - output_scale_3 = np.array([0.25, 0.125, 1.0, 0.5], dtype="float32") - - out = relay.qnn.op.conv2d( - out, - tvm_const(kernel_3), + tvm_const(dense_weights), tvm_const(np.int32(-128)), tvm_const(np.int32(0)), - tvm_const(input_scale_3), - tvm_const(output_scale_3), - channels=4, - kernel_size=(1, 1), - padding=(0, 0), - data_layout="NHWC", - kernel_layout="HWIO", + tvm_const(np.float32(0.015656913)), + tvm_const(np.float32(0.0047202893)), + units=2, + out_dtype="int32", ) - bias_3 = np.array([1, -1, 4, 6], dtype="int32") + dense_bias = np.array([1463, -1463], dtype="int32") out = relay.nn.bias_add( out, - tvm_const(bias_3), - axis=3, + tvm_const(dense_bias), + axis=1, ) return relay.Function([input_var], 
out) -def test_cdc_channel_stripping_matches_expected(): +def test_conv_depthwise_conv(): + """Make sure that qnn_legalize.py is able to detect and remove empty output channels from a + convolution -> depthwise convolution -> convolution pattern by folding into a bias_add op.""" + + original = make_test_conv_depthwise_conv() + with TempOpAttr("nn.bias_add", "FTVMLegalize", legalize_bias_add): - before = make_test_conv_depthwise_conv() - actual = run_opt_pass(before, transform.Legalize()) - expected = run_opt_pass(make_expected_conv_depthwise_conv(), transform.InferType()) - tvm.ir.assert_structural_equal(actual, expected) + unoptimized = run_opt_pass(original, transform.InferType()) + optimized = run_opt_pass(original, transform.Legalize()) + + # Inputs and outputs should be unmodified by channel stripping + assert unoptimized.checked_type == optimized.checked_type + + # Make sure 2/4 channels were removed by channel stripping + assert tuple(unoptimized.body.args[0].args[0].checked_type.shape) == (1, 12, 12, 4) + assert tuple(optimized.body.args[0].args[0].checked_type.shape) == (1, 12, 12, 2) + + # Make sure optimized and unoptimized versions behave identically + np.random.seed(12402) # Fix seed for repeatability + input_data = np.random.randint(-128, 128, size=(1, 12, 12, 4), dtype="int8") + + unoptimized_output = execute_relay_func(unoptimized, np.copy(input_data)) + optimized_output = execute_relay_func(optimized, np.copy(input_data)) + np.testing.assert_array_equal(unoptimized_output, optimized_output) + + +def test_conv_pool_dense(): + """Make sure that qnn_legalize.py is able to detect and remove empty output channels from a + convolution -> avg_pool2d -> dense pattern by folding them into a bias_add op.""" + + original = make_test_conv_pool_dense() + + with TempOpAttr("nn.bias_add", "FTVMLegalize", legalize_bias_add): + unoptimized = run_opt_pass(original, transform.InferType()) + optimized = run_opt_pass(original, transform.Legalize()) + + # Inputs and outputs should be unmodified by channel stripping + assert unoptimized.checked_type == optimized.checked_type + + # Make sure 2/4 channels were removed by channel stripping + assert tuple(unoptimized.body.args[0].args[0].checked_type.shape) == (1, 4) + assert tuple(optimized.body.args[0].args[0].checked_type.shape) == (1, 2) + + # Make sure optimized and unoptimized versions behave identically + np.random.seed(12402) # Fix seed for repeatability + input_data = np.random.randint(-128, 128, size=(1, 3, 3, 4), dtype="int8") + + unoptimized_output = execute_relay_func(unoptimized, np.copy(input_data)) + optimized_output = execute_relay_func(optimized, np.copy(input_data)) + np.testing.assert_array_equal(unoptimized_output, optimized_output) From 06c1c8aad5ca27dccebae60c374e4c12ceffc9f1 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Sun, 12 Feb 2023 14:50:53 -0800 Subject: [PATCH 14/19] Clip legalization stripping test --- python/tvm/topi/arm_cpu/qnn_legalize.py | 42 ++++++++- .../relay/qnn/test_clip_legalization.py | 87 +++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 tests/python/relay/qnn/test_clip_legalization.py diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py index caa92d8c3c22..c93fac57f613 100644 --- a/python/tvm/topi/arm_cpu/qnn_legalize.py +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -38,7 +38,26 @@ def _compute_fixed_conv2d_outputs(requantize_op): - """Compute all conv2d output values that do not depend on the layer input.""" + 
"""Compute all conv2d output values that do not depend on the layer input. + + Parameters + ---------- + requantize_op : relay.expr.Call + A qnn.requantize Relay operator, which must be preceeded by a nn.bias_add op and a + qnn.conv2d operator. The qnn.conv2d operator must have groups==1. All arguments to all three + operators, besides the main tensor, must be constants. + + Returns + ------- + fixed_outputs : Dict[int, int] + A dictionary showing which of the conv2d -> bias_add -> requantize output channels are + "fixed" - i.e. those that do not depend on the input tensor. Each key in the dictionary is + an output channel index, and each value is the value that all entries in that output channel + will have. If the block has no fixed output channels, this dictionary will be empty. + """ + + import pdb + pdb.set_trace() bias_add_op = requantize_op.args[0] conv2d_op = bias_add_op.args[0] @@ -74,7 +93,28 @@ def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs): of our depthwise operator does NOT guarantee we can remove the output, because of padding. This function checks if the padding makes a difference in the outputs, and if not, removes the channels from the depthwise_conv2d. + + Parameters + ---------- + requantize_op : relay.expr.Call + A qnn.requantize Relay operator, which must be preceeded by a nn.bias_add op and a + qnn.conv2d operator. The qnn.conv2d operator must be depthwise. All arguments to all three + operators, besides the main tensor, must be constants. + + fixed_channel_inputs : Dict[int, int] + A dictionary showing which input channels to the qnn.conv2d operator have fixed values, and + what those values are fixed to. Can be empty. Usually, this will be generated by + _compute_fixed_conv2d_outputs. + + Returns + ------- + fixed_outputs : Dict[int, int] + A dictionary showing which of the conv2d -> bias_add -> requantize output channels are + "fixed" - i.e. those that do not depend on the input tensor. Each key in the dictionary is + an output channel index, and each value is the value that all entries in that output channel + will have. If the block has no fixed output channels, this dictionary will be empty. """ + bias_add_op = requantize_op.args[0] depthwise_op = bias_add_op.args[0] diff --git a/tests/python/relay/qnn/test_clip_legalization.py b/tests/python/relay/qnn/test_clip_legalization.py new file mode 100644 index 000000000000..d796a896ff9b --- /dev/null +++ b/tests/python/relay/qnn/test_clip_legalization.py @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test that do-nothing requantize -> clip operators are removed during legalization.""" + +import numpy as np +import pytest + +import tvm +from tvm import nd, relay +from tvm.relay import transform + + +def run_opt_pass(expr, passes): + passes = passes if isinstance(passes, list) else [passes] + mod = tvm.IRModule.from_expr(expr) + seq = tvm.transform.Sequential(passes) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + entry = mod["main"] + return entry if isinstance(expr, relay.Function) else entry.body + + +def tvm_const(obj): + return relay.Constant(nd.array(obj)) + + +@pytest.mark.parametrize( + "dtype,min_val,max_val,is_redundant", + [ + ("int8", -128, 127, True), + ("int8", -127, 127, False), + ("int16", -128, 127, False), + ("int32", -2147483648, 2147483647, True), + ], +) +def test_removes_redundant_requantize_clip_ops(dtype, min_val, max_val, is_redundant): + """Test that qnn.requantize -> clip sequences are removed during legalization if the bounds of + the clip operator match the min and max values of the data type.""" + + input_var = relay.var("x", shape=(1, 3, 3, 4), dtype="int32") + out = relay.qnn.op.requantize( + input_var, + tvm_const(np.float32(1.0)), + tvm_const(np.int32(0)), + tvm_const(np.float32(1.0)), + tvm_const(np.int32(-128)), + axis=3, + out_dtype=dtype, + ) + out = relay.clip(out, a_min=min_val, a_max=max_val) + func = relay.Function([input_var], out) + unmodified = run_opt_pass(func, transform.InferType()) + legalized = run_opt_pass(func, transform.Legalize()) + + # Check that the clip op was removed if and only if `is_redundant` is True. + if is_redundant: + assert legalized.body.op.name == "qnn.requantize" + assert not tvm.ir.structural_equal(unmodified, legalized) + else: + assert legalized.body.op.name == "clip" + tvm.ir.assert_structural_equal(unmodified, legalized) + + +def test_ignores_standalone_clip_ops(): + """The legalization pass should only affect qnn.requantize -> clip sequences, and should leave + standalone clip operators untouched.""" + + input_var = relay.var("x", shape=(1, 3, 3, 4), dtype="int8") + out = relay.clip(input_var, a_min=-128, a_max=127) + func = relay.Function([input_var], out) + unmodified = run_opt_pass(func, transform.InferType()) + legalized = run_opt_pass(func, transform.Legalize()) + tvm.ir.assert_structural_equal(unmodified, legalized) From 622ece43f9c6706331e4a9d14903d706d5d2a1e7 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Tue, 14 Feb 2023 16:37:02 -0500 Subject: [PATCH 15/19] Fix bug in channel stripping tests --- python/tvm/topi/arm_cpu/qnn_legalize.py | 2 -- tests/python/relay/qnn/test_clip_legalization.py | 2 +- tests/python/relay/qnn/test_qnn_channel_stripping.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py index c93fac57f613..125e5839e680 100644 --- a/python/tvm/topi/arm_cpu/qnn_legalize.py +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -56,8 +56,6 @@ def _compute_fixed_conv2d_outputs(requantize_op): will have. If the block has no fixed output channels, this dictionary will be empty. 
""" - import pdb - pdb.set_trace() bias_add_op = requantize_op.args[0] conv2d_op = bias_add_op.args[0] diff --git a/tests/python/relay/qnn/test_clip_legalization.py b/tests/python/relay/qnn/test_clip_legalization.py index d796a896ff9b..d1a9c5901a2d 100644 --- a/tests/python/relay/qnn/test_clip_legalization.py +++ b/tests/python/relay/qnn/test_clip_legalization.py @@ -51,7 +51,7 @@ def test_removes_redundant_requantize_clip_ops(dtype, min_val, max_val, is_redun """Test that qnn.requantize -> clip sequences are removed during legalization if the bounds of the clip operator match the min and max values of the data type.""" - input_var = relay.var("x", shape=(1, 3, 3, 4), dtype="int32") + input_var = relay.var("input", shape=(1, 3, 3, 4), dtype="int32") out = relay.qnn.op.requantize( input_var, tvm_const(np.float32(1.0)), diff --git a/tests/python/relay/qnn/test_qnn_channel_stripping.py b/tests/python/relay/qnn/test_qnn_channel_stripping.py index f5f56fa44ead..25197ca84c54 100644 --- a/tests/python/relay/qnn/test_qnn_channel_stripping.py +++ b/tests/python/relay/qnn/test_qnn_channel_stripping.py @@ -50,7 +50,7 @@ def make_test_conv_depthwise_conv(): """Generates a convolution -> depthwise_convolution -> convolution pattern that can have channels stripped. The structure here mirrors MobileNetV1's layers 8-10.""" - input_var = relay.var("x", shape=(1, 12, 12, 4), dtype="int8") + input_var = relay.var("input", shape=(1, 12, 12, 4), dtype="int8") kernel_1 = np.array( [[0, 1, 0, -2], [0, 3, 0, 5], [0, 5, 0, -9], [0, 2, 0, 21]], dtype="int8" @@ -173,7 +173,7 @@ def make_test_conv_pool_dense(): """Generates a convolution -> pool -> dense pattern that can have channels stripped. The structure here mirrors MobileNetV1's final few layers.""" - input_var = relay.var("x", shape=(1, 3, 3, 4), dtype="int8") + input_var = relay.var("input", shape=(1, 3, 3, 4), dtype="int8") kernel = np.array( [[0, 1, 0, -2], [0, 3, 0, 5], [0, 5, 0, -9], [0, 2, 0, 21]], dtype="int8" From 2649051653baa7e39c1dc6a84d176a304fe23fec Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 1 Mar 2023 00:11:31 -0500 Subject: [PATCH 16/19] Address some code review comments --- python/tvm/relay/qnn/op/_qnn.py | 22 ++++++--------------- python/tvm/topi/arm_cpu/qnn_alter_op.py | 4 ++-- python/tvm/topi/arm_cpu/qnn_legalize.py | 4 ++-- python/tvm/topi/nn/qnn.py | 26 ++----------------------- 4 files changed, 12 insertions(+), 44 deletions(-) diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index 60dc6fdc6eae..c688c308b852 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -17,6 +17,8 @@ # pylint: disable=invalid-name, unused-argument, len-as-condition """QNN operator feature registration""" +import numpy as np + from tvm import topi, TVMError from .. import strategy @@ -91,19 +93,6 @@ def simulated_dequantize_compute(attrs, inputs, output_type): register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy) -def _get_clip_dtype_bounds(dtype): - """Returns the minimum and maximum values of a C integer data type.""" - assert "int" in dtype - bits = int(dtype[dtype.find("int") + 3 :]) - - if dtype.startswith("int"): - return (-(2 ** (bits - 1)), 2 ** (bits - 1) - 1) - elif dtype.startswith("uint"): - return (0, 2**bits - 1) - else: - raise TVMError(f"Clip legalization is not supported for data type '{dtype}'!") - - @register_legalize("clip") def legalize_clip(attrs, inputs, tinfos): """Removes clip operators with bounds matching the defaults for their dtype. 
@@ -113,7 +102,8 @@ def legalize_clip(attrs, inputs, tinfos): """ if hasattr(inputs[0], "op") and inputs[0].op.name == "qnn.requantize": - if _get_clip_dtype_bounds(tinfos[0].dtype) == (attrs.a_min, attrs.a_max): + dtype_info = np.iinfo(tinfos[0].dtype) + if dtype_info.min == attrs.a_min and dtype_info.max == attrs.a_max: return inputs[0] return None @@ -127,7 +117,7 @@ def legalize_bias_add(attrs, inputs, tinfos): be done before layout rewrites occur to minimize the amount of "extra" overhead operators like "cast" and "layout_transform". """ - return topi.nn.qnn_bias_add_legalize(attrs, inputs, tinfos) + return topi.nn.bias_add_legalize(attrs, inputs, tinfos) @register_alter_op_layout("qnn.conv2d") @@ -148,7 +138,7 @@ def alter_op_layout_add(attrs, inputs, tinfos, out_type): Useful for fusing the bias constant with an input zero point constant in a previous quantized op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn. """ - return topi.nn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type) + return topi.nn.add_alter_layout(attrs, inputs, tinfos, out_type) @register_alter_op_layout("qnn.requantize") diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index f4fc01726af2..ff71d81a666c 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -22,7 +22,7 @@ from tvm import nd, relay, target from ..utils import get_const_tuple -from ..nn import qnn_conv2d_alter_layout, qnn_add_alter_layout, qnn_requantize_alter_layout +from ..nn import qnn_conv2d_alter_layout, add_alter_layout, qnn_requantize_alter_layout def prev_ops_match(curr_op: relay.expr.Call, pattern: Iterable[str]): @@ -130,7 +130,7 @@ def alter_conv2d_layout(attrs, inputs, _tinfos, _out_type): return op -@qnn_add_alter_layout.register(["arm_cpu"]) +@add_alter_layout.register(["arm_cpu"]) def alter_add_layout(_attrs, inputs, _tinfos, _out_type): """Fuses the zero point for a previous quantized operator with this add operation. diff --git a/python/tvm/topi/arm_cpu/qnn_legalize.py b/python/tvm/topi/arm_cpu/qnn_legalize.py index 125e5839e680..2833fbce26f1 100644 --- a/python/tvm/topi/arm_cpu/qnn_legalize.py +++ b/python/tvm/topi/arm_cpu/qnn_legalize.py @@ -34,7 +34,7 @@ from tvm.topi.utils import get_const_tuple from tvm import nd, relay from .qnn_alter_op import prev_ops_match, edit_attrs -from ..nn import qnn_bias_add_legalize +from ..nn import bias_add_legalize def _compute_fixed_conv2d_outputs(requantize_op): @@ -328,7 +328,7 @@ def _densify_conv_pool_dense_pattern(attrs, inputs): return new_op -@qnn_bias_add_legalize.register(["arm_cpu"]) +@bias_add_legalize.register(["arm_cpu"]) def legalize_bias_add(attrs, inputs, _tinfos): """Remove empty convolution channels when possible, and "fold" them into the bias add. diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py index 1b583ae8069e..98bbb7ebe50f 100644 --- a/python/tvm/topi/nn/qnn.py +++ b/python/tvm/topi/nn/qnn.py @@ -213,7 +213,7 @@ def qnn_conv2d_alter_layout(_attrs, _inputs, _tinfos, _out_type): @tvm.target.generic_func -def qnn_bias_add_legalize(_attrs, _inputs, _tinfos): +def bias_add_legalize(_attrs, _inputs, _tinfos): """Legalize bias_add layout. Bias add is not a QNN-specific function, but this generic exists so that empty channels can @@ -233,29 +233,7 @@ def qnn_bias_add_legalize(_attrs, _inputs, _tinfos): @tvm.target.generic_func -def qnn_clip_legalize(_attrs, inputs, _tinfos, _out_type): - """Change clip layout. 
- - Parameters - ---------- - attrs : tvm.ir.Attrs - Attributes of current convolution - inputs : tvm.relay.Expr - Grouped input symbols - tinfos : list - Input shape and dtype - out_type: type - The output type - - Note - ---- - Unlike other TOPI functions, this function operates on both graph level and operator level. - """ - return inputs[0] - - -@tvm.target.generic_func -def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): +def add_alter_layout(_attrs, _inputs, _tinfos, _out_type): """Change add layout. Add is not a QNN-specific function, but this generic exists so that bias add operations can be From 866efc892700c1f61dc253e4b30015a23cd76417 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 1 Mar 2023 12:30:32 -0500 Subject: [PATCH 17/19] Fix ugly hack --- python/tvm/topi/arm_cpu/qnn_alter_op.py | 3 +++ python/tvm/topi/hexagon/qnn/nn.py | 2 +- src/relay/qnn/op/dense.cc | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py index ff71d81a666c..31782d69d032 100644 --- a/python/tvm/topi/arm_cpu/qnn_alter_op.py +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -229,6 +229,9 @@ def alter_requantize_layout(attrs, inputs, _tinfos, _out_type): if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu): return None + if not prev_ops_match(inputs[0], ("add", "qnn.conv2d")): + return None + _, in_scale, _, out_scale, _ = inputs in_scale_numpy = in_scale.data.numpy().astype("float64") out_scale_scalar = out_scale.data.numpy().item() diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py index 5702be2e1a33..e60314b82757 100644 --- a/python/tvm/topi/hexagon/qnn/nn.py +++ b/python/tvm/topi/hexagon/qnn/nn.py @@ -874,7 +874,7 @@ def qnn_dense( # Add bias if bias is not None: - out = te.compute(out.shape, lambda n, c: out[n, c] + bias[c]) + out = te.compute(out.shape, lambda n, c: out[n, c] + bias[0, c]) # Requantize output of dense # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input)) diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index b5ba33051447..48f2a813d0e7 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -242,7 +242,7 @@ RELAY_REGISTER_OP("qnn.dense") "The quantization zero_point of the weight tensor.") .set_support_level(11) .add_type_rel("QDense", QnnDenseRel) -// .set_attr("FInferCorrectLayout", QnnDenseInferCorrectLayout) + .set_attr("FInferCorrectLayout", QnnDenseInferCorrectLayout) .set_attr("TNonComputational", true) .set_attr("FTVMQnnCanonicalize", QnnDenseCanonicalize) .set_attr("TOpPattern", kOutEWiseFusable); From bf86d9f8c25e105a12c1b96ab0a4cc977ee088b7 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 1 Mar 2023 12:51:13 -0500 Subject: [PATCH 18/19] Remove unused TVMError --- python/tvm/relay/qnn/op/_qnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index c688c308b852..278ce7ee23c8 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -19,7 +19,7 @@ import numpy as np -from tvm import topi, TVMError +from tvm import topi from .. 
import strategy from ...op.op import register_compute From 698f6e1fabebf9578b278201c9c22910a5642351 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Thu, 2 Mar 2023 14:19:51 -0500 Subject: [PATCH 19/19] Change codegen_c.cc name allocation --- src/target/source/codegen_c.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 7553e0aa1570..514ec8395821 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -632,9 +632,8 @@ void CodeGenC::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, } } -NameSupply global_name_supply = NameSupply(""); void CodeGenC::VisitStmt_(const AllocateConstNode* op) { - std::string symbol_name = global_name_supply->FreshName(op->buffer_var->name_hint); + std::string symbol_name = AllocVarID(op->buffer_var.get()); int64_t num_elements = 1; const auto& data = op->data.value();
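The final patch drops the file-scope NameSupply introduced earlier in this series and instead obtains the constant's symbol through AllocVarID, so the name comes from the same per-codegen name table used for other variables rather than from a process-wide global. The de-duplication behaviour being relied on can be sketched as follows (an illustrative stand-in written in Python, not TVM's actual NameSupply or AllocVarID API):

    class FreshNames:
        # Hands out identifiers, appending _1, _2, ... when a name hint repeats.
        def __init__(self):
            self._counts = {}

        def fresh(self, hint):
            if hint not in self._counts:
                self._counts[hint] = 0
                return hint
            self._counts[hint] += 1
            return f"{hint}_{self._counts[hint]}"

    names = FreshNames()
    assert names.fresh("constant") == "constant"
    assert names.fresh("constant") == "constant_1"
    assert names.fresh("weights") == "weights"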