From 0b5b0e80f93309a7e017010fcccfef045cdec47b Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Mon, 7 Apr 2025 20:46:33 -0700
Subject: [PATCH 01/24] WIP: add initial support for dynamic-quant 2D conv

---
 .../xnnpack/partition/config/gemm_configs.py  |  6 ++
 .../xnnpack/quantizer/xnnpack_quantizer.py    |  1 +
 .../quantizer/xnnpack_quantizer_utils.py      | 11 +++
 backends/xnnpack/test/ops/test_conv2d.py      | 86 ++++++++++++++++++-
 4 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py
index 8712c2709ac..a05bf623e05 100644
--- a/backends/xnnpack/partition/config/gemm_configs.py
+++ b/backends/xnnpack/partition/config/gemm_configs.py
@@ -358,6 +358,11 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
             why(node, "Only support 1D + 2D Conv")
             return False  # Only support 1D + 2D Conv
 
+        precision = self._detect_precision(node)
+        if precision == ConfigPrecisionType.DYNAMIC_QUANT and len(conv_stride) != 2:
+            why(node, "Only support 2D Conv for dynamic quantization")
+            return False
+
         kernel_node = get_input_node(node, 1)
         weight_quant_params = QuantParams.from_weights(kernel_node, ep)
 
@@ -394,6 +399,7 @@ def supported_precision_types(self):
         return [
             ConfigPrecisionType.FP32,
             ConfigPrecisionType.STATIC_QUANT,
+            ConfigPrecisionType.DYNAMIC_QUANT,
         ]
 
 
diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer.py b/backends/xnnpack/quantizer/xnnpack_quantizer.py
index 0ddee53a41a..fdabd0383e6 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer.py
@@ -265,6 +265,7 @@ class XNNPACKQuantizer(Quantizer):
 
     DYNAMIC_OPS = [
         "linear",
+        "conv",
     ]
 
     def __init__(self) -> None:
diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index ce459806c6e..4763e39fa2f 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -304,6 +304,17 @@ def _do_annotate_conv(
     for n in gm.graph.nodes:
         if not is_conv_node(n):
             continue
+
+        # TODO: Check for dynamically quantized convs and check if nn.Conv2d is always lowered
+        # Only dynamically quantize 2D convolutions
+        # Handle both nn.Conv2d and aten.conv2d.default
+        if n.op == "call_module":
+            mod = gm.get_submodule(n.target)
+            if not hasattr(mod, "padding") or len(mod.padding) != 2:
+                continue
+        elif n.op == "call_function" and n.target != torch.ops.aten.conv2d.default:
+            continue
+
         conv_node = n
 
         # This is hacky!
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 80b731bd18e..5001d2b6e88 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -18,6 +18,10 @@
 except:
     has_quantized_ops = False
 
+from executorch.backends.xnnpack.partition.config.xnnpack_config import (
+    ConfigPrecisionType,
+)
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
 from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
     get_symmetric_quantization_config,
 )
@@ -26,7 +30,10 @@
 )
 from executorch.backends.xnnpack.test.test_xnnpack_utils import randomize_bn
 from executorch.backends.xnnpack.test.tester import Quantize, Tester
-
+from executorch.backends.xnnpack.test.tester.tester import (
+    Partition,
+    ToEdgeTransformAndLower,
+)
 from executorch.exir.dialects._ops import ops as exir_ops
 
 
@@ -223,6 +230,61 @@ def _test(
                     .run_method_and_compare_outputs(qtol=1)
                 )
 
+    def _test_dq_conv2d(
+        self,
+        m: torch.nn.Module,
+        inputs,
+        dynamic_shapes,
+        atol=5e-02,
+    ):
+        quant_config = get_symmetric_quantization_config(
+            is_per_channel=True,
+            is_dynamic=True,
+            act_qmin=-128,
+            act_qmax=127,
+            weight_qmin=-128,
+            weight_qmax=127,
+        )
+
+        DynamicallyQuantizedPartitioner = XnnpackPartitioner(
+            config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
+            per_op_mode=False,
+        )
+
+        tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
+        tester = tester.quantize(Quantize(quantization_config=quant_config))
+
+        # Print after quantization
+        tester.stages["quantize"] = tester.stages[tester.cur]
+        print("\n----------Annotated Graph:")
+        print(tester.stages["quantize"].graph_module.code)
+
+        exported = tester.export()
+
+        # Print after exporting
+        tester.stages["export"] = exported.stages[exported.cur]
+        print("\n----------Exported Graph:")
+        print(tester.stages["export"].graph_module.code)
+
+        # Check for choose_qparams
+        tester.check(["torch.ops.quantized_decomposed.choose_qparams"])
+
+        tester.to_edge_transform_and_lower(
+            ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner])
+        )
+
+        # Print after lower and partition
+        print("\n----------Lowered Graph:")
+        print(tester.stages[tester.cur].graph_module.code)
+
+        tester.check(["executorch_exir_dialects_edge__ops_aten_convolution_default"])
+        tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+        tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"])
+
+        tester.to_executorch()
+        tester.serialize()
+        tester.run_method_and_compare_outputs(atol=atol)
+
     def test_fp16_conv2d(self) -> None:
         for transpose in (True, False):
             for has_bias in (True, False):
@@ -699,3 +761,25 @@ def forward(self, x):
             .serialize()
             .run_method_and_compare_outputs(qtol=1)
         )
+
+    def test_dq_conv2d(self) -> None:
+        class SimpleConv2d(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(1, 2, 3)
+                self.conv.weight.requires_grad = False
+                self.conv.bias.requires_grad = False
+
+            def forward(self, x):
+                return self.conv(x)
+
+            def get_inputs(self):
+                return (torch.randn(1, 1, 8, 8),)
+
+        model = SimpleConv2d()
+        self._test_dq_conv2d(
+            model,
+            model.get_inputs(),
+            dynamic_shapes=None,
+            atol=5e-2,
+        )

From 8fcb1170dcddb39e70a2072227ac9e990773b477 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Fri, 11 Apr 2025 17:25:31 -0700
Subject: [PATCH 02/24] Insert NHWC permute before quantize

---
 .../channels_last_tagged_reshape_pass.py      | 28 +++++++++++++++++--
 .../xnnpack/quantizer/xnnpack_quantizer.py    |  2 ++
 .../quantizer/xnnpack_quantizer_utils.py      | 10 -------
 backends/xnnpack/runtime/XNNCompiler.cpp      |  2 +-
 backends/xnnpack/test/ops/test_conv2d.py      | 25 ++++-------------
 5 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 89a44f303df..ca8de0c32df 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -282,16 +282,38 @@ def input_to_nhwc(
                 ChannelsLastTaggedReshapePass.PARTNER_NODE
             ]
         else:
-            # Need to create NHWC node
-            with graph_module.graph.inserting_after(input_node):
+            # trace back to permute
+            origin = input_node
+            while hasattr(origin, "args") and isinstance(origin.args, tuple) and len(origin.args) > 0:
+                origin = origin.args[0]
+
+            # at x choose_qparams and quantize insert permute
+            with graph_module.graph.inserting_after(origin):
                 input_node_nhwc = self.create_call_function_node(
                     graph_module=graph_module,
                     target=exir_ops.edge.aten._to_copy.default,
-                    args=(input_node,),
+                    args=(origin,),
                     memory_format=torch.channels_last,
                 )
+
+            for user in list(origin.users):
+                if user != input_node_nhwc:
+                    user.replace_input_with(origin, input_node_nhwc)
+
+            graph_module.recompile()
             self.mark_as_nhwc_node(input_node_nhwc)
 
+        # TODO: uncomment, use case when permute not needed
+        #     # Need to create NHWC node                     ----------------------------- CONVERSION HAPPENING ----->>
+        #     with graph_module.graph.inserting_after(input_node):
+        #         input_node_nhwc = self.create_call_function_node(
+        #             graph_module=graph_module,
+        #             target=exir_ops.edge.aten._to_copy.default,
+        #             args=(input_node,),
+        #             memory_format=torch.channels_last,
+        #         )
+        #     self.mark_as_nhwc_node(input_node_nhwc)
+
         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
             original_input=input_node,
diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer.py b/backends/xnnpack/quantizer/xnnpack_quantizer.py
index fdabd0383e6..9e24d7b0030 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer.py
@@ -71,8 +71,10 @@ def _supported_symmetric_quantized_operators() -> dict[str, list[OperatorPattern
         "conv2d": [
             [torch.nn.Conv2d, torch.nn.ReLU],
             [torch.nn.Conv2d, F.relu],
+            [torch.nn.Conv2d],
             [F.conv2d, torch.nn.ReLU],
             [F.conv2d, F.relu],
+            [F.conv2d],
         ],
         "linear": [[torch.nn.Linear], [F.linear]],
         "add": [[torch.add]],
diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index 4763e39fa2f..91babc26cc9 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -305,16 +305,6 @@ def _do_annotate_conv(
         if not is_conv_node(n):
             continue
 
-        # TODO: Check for dynamically quantized convs and check if nn.Conv2d is always lowered
-        # Only dynamically quantize 2D convolutions
-        # Handle both nn.Conv2d and aten.conv2d.default
-        if n.op == "call_module":
-            mod = gm.get_submodule(n.target)
-            if not hasattr(mod, "padding") or len(mod.padding) != 2:
-                continue
-        elif n.op == "call_function" and n.target != torch.ops.aten.conv2d.default:
-            continue
-
         conv_node = n
 
         # This is hacky!
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index c0204831c07..988dab86ab7 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -1172,7 +1172,7 @@ Error defineStaticTransposeNode(
   ET_CHECK_OR_RETURN_ERROR(
       status == xnn_status_success,
       Internal,
-      "Failed to create sigmoid node %i with code: %s",
+      "Failed to create static transpose node %i with code: %s",
       node->debug_handle(),
       xnn_status_to_string(status));
 
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 5001d2b6e88..dbd9f75ecaf 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -240,10 +240,6 @@ def _test_dq_conv2d(
         quant_config = get_symmetric_quantization_config(
             is_per_channel=True,
             is_dynamic=True,
-            act_qmin=-128,
-            act_qmax=127,
-            weight_qmin=-128,
-            weight_qmax=127,
         )
 
         DynamicallyQuantizedPartitioner = XnnpackPartitioner(
@@ -254,35 +250,26 @@ def _test_dq_conv2d(
         tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
         tester = tester.quantize(Quantize(quantization_config=quant_config))
 
-        # Print after quantization
         tester.stages["quantize"] = tester.stages[tester.cur]
-        print("\n----------Annotated Graph:")
-        print(tester.stages["quantize"].graph_module.code)
 
         exported = tester.export()
 
-        # Print after exporting
         tester.stages["export"] = exported.stages[exported.cur]
-        print("\n----------Exported Graph:")
-        print(tester.stages["export"].graph_module.code)
 
-        # Check for choose_qparams
         tester.check(["torch.ops.quantized_decomposed.choose_qparams"])
 
         tester.to_edge_transform_and_lower(
             ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner])
         )
 
-        # Print after lower and partition
-        print("\n----------Lowered Graph:")
-        print(tester.stages[tester.cur].graph_module.code)
-
-        tester.check(["executorch_exir_dialects_edge__ops_aten_convolution_default"])
         tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
         tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"])
 
         tester.to_executorch()
-        tester.serialize()
+
+        #tester.serialize()
+        tester.serialize().dump_artifact("conv2d.pte")
+
         tester.run_method_and_compare_outputs(atol=atol)
 
     def test_fp16_conv2d(self) -> None:
@@ -766,7 +753,7 @@ def test_dq_conv2d(self) -> None:
         class SimpleConv2d(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.conv = torch.nn.Conv2d(1, 2, 3)
+                self.conv = torch.nn.Conv2d(3, 10, 3, )
                 self.conv.weight.requires_grad = False
                 self.conv.bias.requires_grad = False
 
@@ -774,7 +761,7 @@ def forward(self, x):
                 return self.conv(x)
 
             def get_inputs(self):
-                return (torch.randn(1, 1, 8, 8),)
+                return (torch.randn(1, 3, 8, 8),)
 
         model = SimpleConv2d()
         self._test_dq_conv2d(

From 4d064da53ab0f3c39cb124b8f5f0865eb826e92d Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Fri, 11 Apr 2025 20:20:33 -0700
Subject: [PATCH 03/24] Refactor permute code

---
 .../channels_last_tagged_reshape_pass.py      | 28 ++++++++-----------
 backends/xnnpack/test/ops/test_conv2d.py      | 10 +++----
 backends/xnnpack/xnnpack_preprocess.py        |  2 +-
 3 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index ca8de0c32df..2bbc62fa588 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -282,12 +282,13 @@ def input_to_nhwc(
                 ChannelsLastTaggedReshapePass.PARTNER_NODE
             ]
         else:
-            # trace back to permute
+            # Need to create NHWC node
             origin = input_node
+            # TODO: safe/correct to always trace back?
+            # Trace back to source node
             while hasattr(origin, "args") and isinstance(origin.args, tuple) and len(origin.args) > 0:
                 origin = origin.args[0]
 
-            # at x choose_qparams and quantize insert permute
             with graph_module.graph.inserting_after(origin):
                 input_node_nhwc = self.create_call_function_node(
                     graph_module=graph_module,
@@ -296,24 +297,17 @@ def input_to_nhwc(
                     memory_format=torch.channels_last,
                 )
 
-            for user in list(origin.users):
-                if user != input_node_nhwc:
-                    user.replace_input_with(origin, input_node_nhwc)
+            # If input_node was not source
+            if origin != input_node:
+                print("Permuted\n\n")
+                # Replace downstream source node with NHWC node
+                for user in list(origin.users):
+                    if user != input_node_nhwc:
+                        user.replace_input_with(origin, input_node_nhwc)
+                graph_module.recompile()
 
-            graph_module.recompile()
             self.mark_as_nhwc_node(input_node_nhwc)
 
-        # TODO: uncomment, use case when permute not needed
-        #     # Need to create NHWC node                     ----------------------------- CONVERSION HAPPENING ----->>
-        #     with graph_module.graph.inserting_after(input_node):
-        #         input_node_nhwc = self.create_call_function_node(
-        #             graph_module=graph_module,
-        #             target=exir_ops.edge.aten._to_copy.default,
-        #             args=(input_node,),
-        #             memory_format=torch.channels_last,
-        #         )
-        #     self.mark_as_nhwc_node(input_node_nhwc)
-
         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
             original_input=input_node,
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index dbd9f75ecaf..902bdc5b50b 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -249,15 +249,14 @@ def _test_dq_conv2d(
 
         tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
         tester = tester.quantize(Quantize(quantization_config=quant_config))
-
-        tester.stages["quantize"] = tester.stages[tester.cur]
-
         exported = tester.export()
 
-        tester.stages["export"] = exported.stages[exported.cur]
-
         tester.check(["torch.ops.quantized_decomposed.choose_qparams"])
 
+        tester.stages["export"] = exported.stages[exported.cur]
+        print("\n----------Exported Graph:")
+        print(tester.stages["export"].graph_module.code)
+
         tester.to_edge_transform_and_lower(
             ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner])
         )
@@ -266,7 +265,6 @@ def _test_dq_conv2d(
         tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"])
 
         tester.to_executorch()
-
         #tester.serialize()
         tester.serialize().dump_artifact("conv2d.pte")
 
diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py
index 84cdfd69a48..086eeccbd58 100644
--- a/backends/xnnpack/xnnpack_preprocess.py
+++ b/backends/xnnpack/xnnpack_preprocess.py
@@ -144,7 +144,7 @@ def preprocess(
         graph_module = ep.graph_module
 
         node_to_external_map = generate_node_to_external_map(ep, graph_module)
-
+        print("\n----------XNNPack Preprocess Graph:", graph_module)
         # Make sure all inputs are contiguous_format or NCHW or default dim order
         assert_default_dim_order(graph_module)
 

From 2905b984015574ba0dd459a7a32ff92441ae6bce Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Fri, 11 Apr 2025 21:30:55 -0700
Subject: [PATCH 04/24] Corrects input to conv

---
 .../channels_last_tagged_reshape_pass.py      | 29 ++++++++++---------
 backends/xnnpack/test/ops/test_conv2d.py      | 14 ++++-----
 backends/xnnpack/xnnpack_preprocess.py        |  2 +-
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 2bbc62fa588..29de407defd 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -283,31 +283,34 @@ def input_to_nhwc(
             ]
         else:
             # Need to create NHWC node
-            origin = input_node
+            source_node = input_node
+
             # TODO: safe/correct to always trace back?
-            # Trace back to source node
-            while hasattr(origin, "args") and isinstance(origin.args, tuple) and len(origin.args) > 0:
-                origin = origin.args[0]
+            # Trace back to find original source node
+            while (
+                hasattr(source_node, "args")
+                and isinstance(source_node.args, tuple)
+                and len(source_node.args) > 0
+            ):
+                source_node = source_node.args[0]
 
-            with graph_module.graph.inserting_after(origin):
+            with graph_module.graph.inserting_after(source_node):
                 input_node_nhwc = self.create_call_function_node(
                     graph_module=graph_module,
                     target=exir_ops.edge.aten._to_copy.default,
-                    args=(origin,),
+                    args=(source_node,),
                     memory_format=torch.channels_last,
                 )
 
-            # If input_node was not source
-            if origin != input_node:
-                print("Permuted\n\n")
+            # If input_node was not the original source node
+            if source_node != input_node:
+                input_node = source_node
                 # Replace downstream source node with NHWC node
-                for user in list(origin.users):
+                for user in list(input_node.users):
                     if user != input_node_nhwc:
-                        user.replace_input_with(origin, input_node_nhwc)
+                        user.replace_input_with(input_node, input_node_nhwc)
                 graph_module.recompile()
 
-            self.mark_as_nhwc_node(input_node_nhwc)
-
         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
             original_input=input_node,
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 902bdc5b50b..64e3d5e53ad 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -249,14 +249,10 @@ def _test_dq_conv2d(
 
         tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
         tester = tester.quantize(Quantize(quantization_config=quant_config))
-        exported = tester.export()
+        tester.export()
 
         tester.check(["torch.ops.quantized_decomposed.choose_qparams"])
 
-        tester.stages["export"] = exported.stages[exported.cur]
-        print("\n----------Exported Graph:")
-        print(tester.stages["export"].graph_module.code)
-
         tester.to_edge_transform_and_lower(
             ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner])
         )
@@ -265,7 +261,7 @@ def _test_dq_conv2d(
         tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"])
 
         tester.to_executorch()
-        #tester.serialize()
+        # tester.serialize()
         tester.serialize().dump_artifact("conv2d.pte")
 
         tester.run_method_and_compare_outputs(atol=atol)
@@ -751,7 +747,11 @@ def test_dq_conv2d(self) -> None:
         class SimpleConv2d(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.conv = torch.nn.Conv2d(3, 10, 3, )
+                self.conv = torch.nn.Conv2d(
+                    3,
+                    10,
+                    3,
+                )
                 self.conv.weight.requires_grad = False
                 self.conv.bias.requires_grad = False
 
diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py
index 086eeccbd58..84cdfd69a48 100644
--- a/backends/xnnpack/xnnpack_preprocess.py
+++ b/backends/xnnpack/xnnpack_preprocess.py
@@ -144,7 +144,7 @@ def preprocess(
         graph_module = ep.graph_module
 
         node_to_external_map = generate_node_to_external_map(ep, graph_module)
-        print("\n----------XNNPack Preprocess Graph:", graph_module)
+
         # Make sure all inputs are contiguous_format or NCHW or default dim order
         assert_default_dim_order(graph_module)
 

From 0fef04a2ac4e9d350a8449cab2d9792d64305497 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sat, 12 Apr 2025 01:03:59 -0700
Subject: [PATCH 05/24] Add is_dequant check for trace back when inserting
 permute

---
 .../channels_last_tagged_reshape_pass.py      | 35 ++++++++++---------
 backends/xnnpack/test/ops/test_conv2d.py      |  2 +-
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 29de407defd..f02a5f816a1 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -283,29 +283,32 @@ def input_to_nhwc(
             ]
         else:
             # Need to create NHWC node
-            source_node = input_node
-
-            # TODO: safe/correct to always trace back?
-            # Trace back to find original source node
-            while (
-                hasattr(source_node, "args")
-                and isinstance(source_node.args, tuple)
-                and len(source_node.args) > 0
-            ):
-                source_node = source_node.args[0]
+            # TODO: Best way to determine if trace back required?
+            is_dequant = (
+                input_node.op == "call_function"
+                and getattr(input_node.target, "__name__", "")
+                == "quantized_decomposed.dequantize_per_tensor.tensor"
+            )
+
+            if is_dequant:
+                # Trace back to find original source node
+                while (
+                    hasattr(input_node, "args")
+                    and isinstance(input_node.args, tuple)
+                    and len(input_node.args) > 0
+                ):
+                    input_node = input_node.args[0]
 
-            with graph_module.graph.inserting_after(source_node):
+            with graph_module.graph.inserting_after(input_node):
                 input_node_nhwc = self.create_call_function_node(
                     graph_module=graph_module,
                     target=exir_ops.edge.aten._to_copy.default,
-                    args=(source_node,),
+                    args=(input_node,),
                     memory_format=torch.channels_last,
                 )
 
-            # If input_node was not the original source node
-            if source_node != input_node:
-                input_node = source_node
-                # Replace downstream source node with NHWC node
+            if is_dequant:
+                # Replace downstream input_nodes with NHWC node
                 for user in list(input_node.users):
                     if user != input_node_nhwc:
                         user.replace_input_with(input_node, input_node_nhwc)
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 64e3d5e53ad..b03108dbb48 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -248,7 +248,7 @@ def _test_dq_conv2d(
         )
 
         tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
-        tester = tester.quantize(Quantize(quantization_config=quant_config))
+        tester.quantize(Quantize(quantization_config=quant_config))
         tester.export()
 
         tester.check(["torch.ops.quantized_decomposed.choose_qparams"])

From f8f998c4f41c745a499b1171bcd4834de273e6c4 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sat, 12 Apr 2025 01:09:29 -0700
Subject: [PATCH 06/24] Fix node identity check

---
 backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index f02a5f816a1..dd8eea754a0 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -310,7 +310,7 @@ def input_to_nhwc(
             if is_dequant:
                 # Replace downstream input_nodes with NHWC node
                 for user in list(input_node.users):
-                    if user != input_node_nhwc:
+                    if user is not input_node_nhwc:
                         user.replace_input_with(input_node, input_node_nhwc)
                 graph_module.recompile()
 

From 2efe9bbf50daa2eeeb0a8ef1bd8b89b7b5e65e04 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sun, 13 Apr 2025 14:32:55 -0700
Subject: [PATCH 07/24] Use existing is_dequant check and update atol

---
 .../channels_last_tagged_reshape_pass.py      | 20 ++++++-------------
 backends/xnnpack/test/ops/test_conv2d.py      |  2 +-
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index dd8eea754a0..cbd16f7de08 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -9,10 +9,10 @@
 import torch
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 from executorch.backends.xnnpack.utils.utils import is_param_node
+from executorch.backends.xnnpack.utils.quant_utils import is_dequant
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 
-
 # TODO(T151254305) use subgraph_rewriter
 class ChannelsLastTaggedReshapePass(XNNPACKPass):
     """
@@ -283,20 +283,12 @@ def input_to_nhwc(
             ]
         else:
             # Need to create NHWC node
-            # TODO: Best way to determine if trace back required?
-            is_dequant = (
-                input_node.op == "call_function"
-                and getattr(input_node.target, "__name__", "")
-                == "quantized_decomposed.dequantize_per_tensor.tensor"
-            )
+            # TODO: If input is dequant does that it's from dynamic quantization?
+            input_is_dequant = is_dequant(input_node)
 
-            if is_dequant:
+            if input_is_dequant:
                 # Trace back to find original source node
-                while (
-                    hasattr(input_node, "args")
-                    and isinstance(input_node.args, tuple)
-                    and len(input_node.args) > 0
-                ):
+                while getattr(input_node, "args", None):
                     input_node = input_node.args[0]
 
             with graph_module.graph.inserting_after(input_node):
@@ -307,7 +299,7 @@ def input_to_nhwc(
                     memory_format=torch.channels_last,
                 )
 
-            if is_dequant:
+            if input_is_dequant:
                 # Replace downstream input_nodes with NHWC node
                 for user in list(input_node.users):
                     if user is not input_node_nhwc:
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index b03108dbb48..77e1f0b3f89 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -766,5 +766,5 @@ def get_inputs(self):
             model,
             model.get_inputs(),
             dynamic_shapes=None,
-            atol=5e-2,
+            atol=3.0,
         )

From 3762e0d7bf70e22a7d0157053cf4a8e2adc8a6c0 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Mon, 14 Apr 2025 19:35:25 -0700
Subject: [PATCH 08/24] Implement replace_all_uses_with function

---
 .../channels_last_tagged_reshape_pass.py      |   8 +-
 backends/xnnpack/test/ops/test_conv2d.py      |   6 +-
 .../spinquant/third-party/FFHT/Makefile       | 236 +++++++++++++++++-
 3 files changed, 230 insertions(+), 20 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index cbd16f7de08..80d47b4630e 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -283,7 +283,7 @@ def input_to_nhwc(
             ]
         else:
             # Need to create NHWC node
-            # TODO: If input is dequant does that it's from dynamic quantization?
+            # TODO: Replace with check to determine if dynamic quant
             input_is_dequant = is_dequant(input_node)
 
             if input_is_dequant:
@@ -301,10 +301,8 @@ def input_to_nhwc(
 
             if input_is_dequant:
                 # Replace downstream input_nodes with NHWC node
-                for user in list(input_node.users):
-                    if user is not input_node_nhwc:
-                        user.replace_input_with(input_node, input_node_nhwc)
-                graph_module.recompile()
+                input_node.replace_all_uses_with(input_node_nhwc)
+                input_node_nhwc.args = (input_node,)
 
         self.insert_copy_and_assign_partner_nodes_quantization_sensitive(
             graph_module=graph_module,
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 77e1f0b3f89..a33c9989770 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -244,7 +244,7 @@ def _test_dq_conv2d(
 
         DynamicallyQuantizedPartitioner = XnnpackPartitioner(
             config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
-            per_op_mode=False,
+            per_op_mode=True,
         )
 
         tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
@@ -762,9 +762,11 @@ def get_inputs(self):
                 return (torch.randn(1, 3, 8, 8),)
 
         model = SimpleConv2d()
+        inputs = model.get_inputs()
+
         self._test_dq_conv2d(
             model,
-            model.get_inputs(),
+            inputs,
             dynamic_shapes=None,
             atol=3.0,
         )
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
index 7cbeb3ddae9..b06ca7909ae 100644
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
+++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
@@ -1,21 +1,231 @@
-CC = gcc
-CFLAGS = -O3 -march=native -std=c99 -pedantic -Wall -Wextra -Wshadow -Wpointer-arith -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.31
 
-all: test_float test_double fast_copy.o fht.o
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
 
-OBJ := dumb_fht.o fast_copy.o fht.o
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
 
-%.o: %.c
-	$(CC) $< -o $@ -c $(CFLAGS)
+#=============================================================================
+# Special targets provided by cmake.
 
-test_%: test_%.c $(OBJ)
-	$(CC) $< $(OBJ) -o $@ $(CFLAGS)
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
 
-test_double_header_only: test_double_header_only.c
-	$(CC) $< -o $@ $(CFLAGS)
+# Disable VCS-based implicit rules.
+% : %,v
 
-test_float_header_only: test_double_header_only.c
-	$(CC) $< -o $@ $(CFLAGS)
+# Disable VCS-based implicit rules.
+% : RCS/%
 
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake
+
+# The command to remove a file.
+RM = /opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /Users/zuby/PycharmProjects/src/executorch
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /Users/zuby/PycharmProjects/src/executorch
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Running CMake cache editor..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Running CMake to regenerate build system..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Install the project..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Install the project..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing only the local directory..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing only the local directory..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing the project stripped..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing the project stripped..."
+	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /Users/zuby/PycharmProjects/src/executorch && $(CMAKE_COMMAND) -E cmake_progress_start /Users/zuby/PycharmProjects/src/executorch/CMakeFiles /Users/zuby/PycharmProjects/src/executorch/extension/llm/custom_ops/spinquant/third-party/FFHT//CMakeFiles/progress.marks
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /Users/zuby/PycharmProjects/src/executorch/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
 clean:
-	rm -f test_float test_double test_float_header_only test_double_header_only $(OBJ)
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule
+.PHONY : extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule
+
+# Convenience name for target.
+dumb_fht: extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule
+.PHONY : dumb_fht
+
+# fast build rule for target.
+dumb_fht/fast:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build
+.PHONY : dumb_fht/fast
+
+dumb_fht.o: dumb_fht.c.o
+.PHONY : dumb_fht.o
+
+# target to build an object file
+dumb_fht.c.o:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/dumb_fht.c.o
+.PHONY : dumb_fht.c.o
+
+dumb_fht.i: dumb_fht.c.i
+.PHONY : dumb_fht.i
+
+# target to preprocess a source file
+dumb_fht.c.i:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/dumb_fht.c.i
+.PHONY : dumb_fht.c.i
+
+dumb_fht.s: dumb_fht.c.s
+.PHONY : dumb_fht.s
+
+# target to generate assembly for a file
+dumb_fht.c.s:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/dumb_fht.c.s
+.PHONY : dumb_fht.c.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... dumb_fht"
+	@echo "... dumb_fht.o"
+	@echo "... dumb_fht.i"
+	@echo "... dumb_fht.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /Users/zuby/PycharmProjects/src/executorch && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+

From 4112c6a11fe2e2f9a7040072434732afc435c3d9 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Mon, 14 Apr 2025 19:38:22 -0700
Subject: [PATCH 09/24] Remove cmake file

---
 .../spinquant/third-party/FFHT/Makefile       | 236 +-----------------
 1 file changed, 13 insertions(+), 223 deletions(-)

diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
index b06ca7909ae..7cbeb3ddae9 100644
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
+++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
@@ -1,231 +1,21 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.31
+CC = gcc
+CFLAGS = -O3 -march=native -std=c99 -pedantic -Wall -Wextra -Wshadow -Wpointer-arith -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes
 
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
+all: test_float test_double fast_copy.o fht.o
 
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
+OBJ := dumb_fht.o fast_copy.o fht.o
 
-#=============================================================================
-# Special targets provided by cmake.
+%.o: %.c
+	$(CC) $< -o $@ -c $(CFLAGS)
 
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
+test_%: test_%.c $(OBJ)
+	$(CC) $< $(OBJ) -o $@ $(CFLAGS)
 
-# Disable VCS-based implicit rules.
-% : %,v
+test_double_header_only: test_double_header_only.c
+	$(CC) $< -o $@ $(CFLAGS)
 
-# Disable VCS-based implicit rules.
-% : RCS/%
+test_float_header_only: test_double_header_only.c
+	$(CC) $< -o $@ $(CFLAGS)
 
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake
-
-# The command to remove a file.
-RM = /opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /Users/zuby/PycharmProjects/src/executorch
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /Users/zuby/PycharmProjects/src/executorch
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Running CMake cache editor..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Running CMake to regenerate build system..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Install the project..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Install the project..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing only the local directory..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing only the local directory..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing the project stripped..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color "--switch=$(COLOR)" --cyan "Installing the project stripped..."
-	/opt/anaconda3/envs/executorch/lib/python3.10/site-packages/cmake/data/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /Users/zuby/PycharmProjects/src/executorch && $(CMAKE_COMMAND) -E cmake_progress_start /Users/zuby/PycharmProjects/src/executorch/CMakeFiles /Users/zuby/PycharmProjects/src/executorch/extension/llm/custom_ops/spinquant/third-party/FFHT//CMakeFiles/progress.marks
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /Users/zuby/PycharmProjects/src/executorch/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
 clean:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule
-.PHONY : extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule
-
-# Convenience name for target.
-dumb_fht: extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/rule
-.PHONY : dumb_fht
-
-# fast build rule for target.
-dumb_fht/fast:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build
-.PHONY : dumb_fht/fast
-
-dumb_fht.o: dumb_fht.c.o
-.PHONY : dumb_fht.o
-
-# target to build an object file
-dumb_fht.c.o:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/dumb_fht.c.o
-.PHONY : dumb_fht.c.o
-
-dumb_fht.i: dumb_fht.c.i
-.PHONY : dumb_fht.i
-
-# target to preprocess a source file
-dumb_fht.c.i:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/dumb_fht.c.i
-.PHONY : dumb_fht.c.i
-
-dumb_fht.s: dumb_fht.c.s
-.PHONY : dumb_fht.s
-
-# target to generate assembly for a file
-dumb_fht.c.s:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(MAKE) $(MAKESILENT) -f extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/build.make extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeFiles/dumb_fht.dir/dumb_fht.c.s
-.PHONY : dumb_fht.c.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... dumb_fht"
-	@echo "... dumb_fht.o"
-	@echo "... dumb_fht.i"
-	@echo "... dumb_fht.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /Users/zuby/PycharmProjects/src/executorch && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
+	rm -f test_float test_double test_float_header_only test_double_header_only $(OBJ)

From cdd6f2df68908350fca0f0eabdfcdaefa02bb227 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Tue, 15 Apr 2025 18:31:05 -0700
Subject: [PATCH 10/24] Restore original supported conv2d operators

---
 backends/xnnpack/quantizer/xnnpack_quantizer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer.py b/backends/xnnpack/quantizer/xnnpack_quantizer.py
index 9e24d7b0030..fdabd0383e6 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer.py
@@ -71,10 +71,8 @@ def _supported_symmetric_quantized_operators() -> dict[str, list[OperatorPattern
         "conv2d": [
             [torch.nn.Conv2d, torch.nn.ReLU],
             [torch.nn.Conv2d, F.relu],
-            [torch.nn.Conv2d],
             [F.conv2d, torch.nn.ReLU],
             [F.conv2d, F.relu],
-            [F.conv2d],
         ],
         "linear": [[torch.nn.Linear], [F.linear]],
         "add": [[torch.add]],

From 7150872bdec7102257366e246359f7610ccdefe1 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Tue, 15 Apr 2025 18:34:05 -0700
Subject: [PATCH 11/24] Add dynamic quant check before NHWC permute

---
 .../_passes/channels_last_tagged_reshape_pass.py     | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 80d47b4630e..1a52827418d 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -9,7 +9,7 @@
 import torch
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 from executorch.backends.xnnpack.utils.utils import is_param_node
-from executorch.backends.xnnpack.utils.quant_utils import is_dequant
+from executorch.backends.xnnpack.utils.quant_utils import is_dynamic_qdq
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 
@@ -283,11 +283,11 @@ def input_to_nhwc(
             ]
         else:
             # Need to create NHWC node
-            # TODO: Replace with check to determine if dynamic quant
-            input_is_dequant = is_dequant(input_node)
+            # Check if input uses dynamic quantization
+            is_dynamic_input = is_dynamic_qdq(input_node)
 
-            if input_is_dequant:
-                # Trace back to find original source node
+            if is_dynamic_input:
+                # Trace back to original source node
                 while getattr(input_node, "args", None):
                     input_node = input_node.args[0]
 
@@ -299,7 +299,7 @@ def input_to_nhwc(
                     memory_format=torch.channels_last,
                 )
 
-            if input_is_dequant:
+            if is_dynamic_input:
                 # Replace downstream input_nodes with NHWC node
                 input_node.replace_all_uses_with(input_node_nhwc)
                 input_node_nhwc.args = (input_node,)

From 6b44c4bbec825cd4a1626e8c5c8336dcd00407d9 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Tue, 15 Apr 2025 19:24:51 -0700
Subject: [PATCH 12/24] Refactor dq conv2d test

---
 backends/xnnpack/test/ops/test_conv2d.py | 55 +++++++++---------------
 1 file changed, 21 insertions(+), 34 deletions(-)

diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index a33c9989770..20a668ed623 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -176,6 +176,20 @@ def get_inputs(self):
         return (torch.randn(2, 2, 4, 4),)
 
 
+class DQConv2d(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 10, 3)
+        self.conv.weight.requires_grad = False
+        self.conv.bias.requires_grad = False
+
+    def forward(self, x):
+        return self.conv(x)
+
+    def get_inputs(self):
+        return (torch.randn(1, 3, 8, 8),)
+
+
 class TestConv2d(unittest.TestCase):
     def setUp(self):
         torch._dynamo.reset()
@@ -230,12 +244,11 @@ def _test(
                     .run_method_and_compare_outputs(qtol=1)
                 )
 
-    def _test_dq_conv2d(
+    def _test_dq(
         self,
         m: torch.nn.Module,
         inputs,
         dynamic_shapes,
-        atol=5e-02,
     ):
         quant_config = get_symmetric_quantization_config(
             is_per_channel=True,
@@ -250,21 +263,15 @@ def _test_dq_conv2d(
         tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
         tester.quantize(Quantize(quantization_config=quant_config))
         tester.export()
-
         tester.check(["torch.ops.quantized_decomposed.choose_qparams"])
-
         tester.to_edge_transform_and_lower(
             ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner])
         )
-
         tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
         tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"])
-
         tester.to_executorch()
-        # tester.serialize()
-        tester.serialize().dump_artifact("conv2d.pte")
-
-        tester.run_method_and_compare_outputs(atol=atol)
+        tester.serialize()
+        tester.run_method_and_compare_outputs(qtol=1)
 
     def test_fp16_conv2d(self) -> None:
         for transpose in (True, False):
@@ -743,30 +750,10 @@ def forward(self, x):
             .run_method_and_compare_outputs(qtol=1)
         )
 
-    def test_dq_conv2d(self) -> None:
-        class SimpleConv2d(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.conv = torch.nn.Conv2d(
-                    3,
-                    10,
-                    3,
-                )
-                self.conv.weight.requires_grad = False
-                self.conv.bias.requires_grad = False
-
-            def forward(self, x):
-                return self.conv(x)
-
-            def get_inputs(self):
-                return (torch.randn(1, 3, 8, 8),)
-
-        model = SimpleConv2d()
-        inputs = model.get_inputs()
-
-        self._test_dq_conv2d(
+    def test_qs8_dq_conv2d(self) -> None:
+        model = DQConv2d()
+        self._test_dq(
             model,
-            inputs,
+            model.get_inputs(),
             dynamic_shapes=None,
-            atol=3.0,
         )

From 7054f2ef59375878a4edbbfb4cbfbf808f11c570 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Tue, 15 Apr 2025 19:31:54 -0700
Subject: [PATCH 13/24] Revert formatting

---
 backends/xnnpack/quantizer/xnnpack_quantizer_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index 91babc26cc9..ce459806c6e 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -304,7 +304,6 @@ def _do_annotate_conv(
     for n in gm.graph.nodes:
         if not is_conv_node(n):
             continue
-
         conv_node = n
 
         # This is hacky!

From fc48e03b088399e7e80f58847f8ff2990e8b3b84 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Tue, 15 Apr 2025 20:21:51 -0700
Subject: [PATCH 14/24] Add check to only annotate dq conv2d

---
 .../xnnpack/quantizer/xnnpack_quantizer_utils.py    | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index ce459806c6e..2c2c074815e 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -42,6 +42,8 @@
     "propagate_annotation",
 ]
 
+from pytorch.ao.test.dtypes.test_bitpacking import dimensions
+
 
 # In the absence of better name, just winging it with QuantizationConfig
 @dataclass(eq=True, frozen=True)
@@ -323,6 +325,17 @@ def _do_annotate_conv(
         assert isinstance(weight, Node)
         input_qspec_map[weight] = get_weight_qspec(quantization_config)
 
+        # Only annotate dynamically quantized conv if it's 2D
+        if (
+            quantization_config
+            and quantization_config.input_activation
+            and quantization_config.input_activation.is_dynamic
+        ):
+            weight_val = weight.meta.get("val", None)
+            weight_shape = getattr(weight_val, "shape", None)
+            if weight_shape is not None and len(weight_shape) != 4:
+                continue
+
         # adding weight node to the partition as well
         partition = [conv_node, conv_node.args[1]]
 

From 84b3634cfaab3859463e067e7f0b8b902b8db153 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Tue, 15 Apr 2025 20:24:35 -0700
Subject: [PATCH 15/24] Remove unused import

---
 backends/xnnpack/quantizer/xnnpack_quantizer_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index 2c2c074815e..f400f51a464 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -42,8 +42,6 @@
     "propagate_annotation",
 ]
 
-from pytorch.ao.test.dtypes.test_bitpacking import dimensions
-
 
 # In the absence of better name, just winging it with QuantizationConfig
 @dataclass(eq=True, frozen=True)

From 62e30e574cff026a7ac043c89a697f74af809242 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Wed, 16 Apr 2025 15:15:24 -0700
Subject: [PATCH 16/24] Add computation for non-batch dims; remove non-batch
 dims check

---
 backends/xnnpack/operators/quant_params.py | 7 ++++++-
 backends/xnnpack/runtime/XNNCompiler.cpp   | 5 -----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py
index e695b151560..ddb9db865f3 100644
--- a/backends/xnnpack/operators/quant_params.py
+++ b/backends/xnnpack/operators/quant_params.py
@@ -145,9 +145,14 @@ def quantize_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
     def _from_dynamic_input_node(cls, quant_node: torch.fx.Node) -> QuantParams:
         q_input = quant_node.args[0]  # fp32 input
         assert isinstance(q_input, torch.fx.Node)
-        # TODO - materialize this from the quant_node scale count and val shape
         num_nonbatch_dims = 1
 
+        # Compute non-batch dimensions (shape length - 1), defaulting to 1
+        q_input_val = q_input.meta.get("val", None)
+        q_input_shape = getattr(q_input_val, "shape", None)
+        if q_input_shape is not None:
+            num_nonbatch_dims = max(len(q_input_shape) - 1, 1)
+
         return cls(
             per_channel=False,  # True is not valid
             q_input=q_input,
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index 988dab86ab7..0b187d05df0 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -512,11 +512,6 @@ Error defineTensor(
             buffer_ptr == nullptr,
             Internal,
             "Dynamically quantized tensor should not have constant data but found non-nullptr");
-        // TODO(T179441835): Dynamic Quantization with num_nonbatch_dims > 1
-        ET_CHECK_OR_RETURN_ERROR(
-            qparams->num_nonbatch_dims() == 1,
-            Internal,
-            "Dynamically Quantized Tensors currently only support per token quantization");
         status = xnn_define_dynamically_quantized_tensor_value(
             /*subgraph=*/subgraph_ptr,
             /*datatype=*/getDataType(tensor_value->datatype()),

From 3c7fe328af6c28c3af70c5c41784f098bf7f4706 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Wed, 16 Apr 2025 15:21:21 -0700
Subject: [PATCH 17/24] Refactor test and imports

---
 .../_passes/channels_last_tagged_reshape_pass.py      |  3 ++-
 backends/xnnpack/test/ops/test_conv2d.py              | 11 ++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 1a52827418d..768df1f4f04 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -8,11 +8,12 @@
 
 import torch
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
-from executorch.backends.xnnpack.utils.utils import is_param_node
 from executorch.backends.xnnpack.utils.quant_utils import is_dynamic_qdq
+from executorch.backends.xnnpack.utils.utils import is_param_node
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 
+
 # TODO(T151254305) use subgraph_rewriter
 class ChannelsLastTaggedReshapePass(XNNPACKPass):
     """
diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 20a668ed623..0c456ebc21a 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -30,10 +30,7 @@
 )
 from executorch.backends.xnnpack.test.test_xnnpack_utils import randomize_bn
 from executorch.backends.xnnpack.test.tester import Quantize, Tester
-from executorch.backends.xnnpack.test.tester.tester import (
-    Partition,
-    ToEdgeTransformAndLower,
-)
+from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
 from executorch.exir.dialects._ops import ops as exir_ops
 
 
@@ -176,7 +173,7 @@ def get_inputs(self):
         return (torch.randn(2, 2, 4, 4),)
 
 
-class DQConv2d(torch.nn.Module):
+class Conv2dDynamicQuant(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.conv = torch.nn.Conv2d(3, 10, 3)
@@ -750,8 +747,8 @@ def forward(self, x):
             .run_method_and_compare_outputs(qtol=1)
         )
 
-    def test_qs8_dq_conv2d(self) -> None:
-        model = DQConv2d()
+    def test_dq_conv2d(self) -> None:
+        model = Conv2dDynamicQuant()
         self._test_dq(
             model,
             model.get_inputs(),

From 064671b2c5c5b97804b3042d3eb629e181291d2a Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Wed, 16 Apr 2025 15:35:32 -0700
Subject: [PATCH 18/24] Update comments

---
 backends/xnnpack/quantizer/xnnpack_quantizer_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index f400f51a464..92eff3d0d68 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -331,6 +331,8 @@ def _do_annotate_conv(
         ):
             weight_val = weight.meta.get("val", None)
             weight_shape = getattr(weight_val, "shape", None)
+
+            # Skip if not a 4D weight tensor (i.e. not conv2d)
             if weight_shape is not None and len(weight_shape) != 4:
                 continue
 

From b29030ed21e921424adeb1da25f42243b0a7827a Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sun, 20 Apr 2025 09:52:00 -0700
Subject: [PATCH 19/24] Add unit tests for dynamic quant sequential and
 parallel convs

---
 backends/xnnpack/test/ops/test_conv2d.py | 69 +++++++++++++++++++-----
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 0c456ebc21a..4d1b387b8ec 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -173,12 +173,10 @@ def get_inputs(self):
         return (torch.randn(2, 2, 4, 4),)
 
 
-class Conv2dDynamicQuant(torch.nn.Module):
+class Conv2dDQ(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.conv = torch.nn.Conv2d(3, 10, 3)
-        self.conv.weight.requires_grad = False
-        self.conv.bias.requires_grad = False
+        self.conv = torch.nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3)
 
     def forward(self, x):
         return self.conv(x)
@@ -187,6 +185,43 @@ def get_inputs(self):
         return (torch.randn(1, 3, 8, 8),)
 
 
+class Conv2dDQSeq(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.first = torch.nn.Conv2d(
+            in_channels=3, out_channels=8, kernel_size=3, padding=1
+        )
+        self.second = torch.nn.Conv2d(
+            in_channels=8, out_channels=10, kernel_size=3, padding=1
+        )
+
+    def forward(self, x):
+        y = self.first(x)
+        return self.second(y)
+
+    def get_inputs(self):
+        return (torch.randn(1, 3, 8, 8),)
+
+
+class Conv2dDQParallel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.first = torch.nn.Conv2d(
+            in_channels=3, out_channels=8, kernel_size=3, padding=1
+        )
+        self.second = torch.nn.Conv2d(
+            in_channels=3, out_channels=10, kernel_size=3, padding=1
+        )
+
+    def forward(self, x):
+        first = self.first(x)
+        second = self.second(x)
+        return first, second
+
+    def get_inputs(self):
+        return (torch.randn(1, 3, 8, 8),)
+
+
 class TestConv2d(unittest.TestCase):
     def setUp(self):
         torch._dynamo.reset()
@@ -244,8 +279,8 @@ def _test(
     def _test_dq(
         self,
         m: torch.nn.Module,
-        inputs,
-        dynamic_shapes,
+        conv_count=1,
+        dynamic_shapes=None,
     ):
         quant_config = get_symmetric_quantization_config(
             is_per_channel=True,
@@ -257,14 +292,16 @@ def _test_dq(
             per_op_mode=True,
         )
 
-        tester = Tester(m, inputs, dynamic_shapes=dynamic_shapes)
+        tester = Tester(m, m.get_inputs(), dynamic_shapes=dynamic_shapes)
         tester.quantize(Quantize(quantization_config=quant_config))
         tester.export()
         tester.check(["torch.ops.quantized_decomposed.choose_qparams"])
         tester.to_edge_transform_and_lower(
             ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner])
         )
-        tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+        tester.check_count(
+            {"torch.ops.higher_order.executorch_call_delegate": conv_count}
+        )
         tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"])
         tester.to_executorch()
         tester.serialize()
@@ -748,9 +785,13 @@ def forward(self, x):
         )
 
     def test_dq_conv2d(self) -> None:
-        model = Conv2dDynamicQuant()
-        self._test_dq(
-            model,
-            model.get_inputs(),
-            dynamic_shapes=None,
-        )
+        model = Conv2dDQ()
+        self._test_dq(model)
+
+    def test_dq_conv2d_seq(self) -> None:
+        model = Conv2dDQSeq()
+        self._test_dq(model, conv_count=2)
+
+    def test_dq_conv2d_parallel(self) -> None:
+        model = Conv2dDQParallel()
+        self._test_dq(model, conv_count=2)

From 6da8b7d5414e852d0cb229cae797f3ded0e77bc7 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sun, 20 Apr 2025 09:55:36 -0700
Subject: [PATCH 20/24] Add unit test for dynamic quant conv2d with
 channels-last permute

---
 .../test_channels_last_tagged_reshape.py      | 43 ++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
index 6d60f9d76b5..a00209f4ea6 100644
--- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
+++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py
@@ -10,10 +10,13 @@
 from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import (
     ChannelsLastTaggedReshapePass,
 )
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+)
 from executorch.backends.xnnpack.test.test_xnnpack_utils_classes import (
     OpSequencesAddConv2d,
 )
-from executorch.backends.xnnpack.test.tester import RunPasses, Tester
+from executorch.backends.xnnpack.test.tester import Quantize, RunPasses, Tester
 
 
 class TestChannelsLastTaggedReshapePass(unittest.TestCase):
@@ -35,6 +38,10 @@ def setUp(self):
     dequant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default"
     conv_name = "executorch_exir_dialects_edge__ops_aten_convolution_default"
     relu_name = "executorch_exir_dialects_edge__ops_aten_relu_default"
+    choose_qparams_name = (
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_tensor"
+    )
+    dynamic_quant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_tensor"
 
     def test_fp32_channels_last_tagged_reshape_pass(self):
         for module, num_reshape in self.modules.items():
@@ -179,3 +186,37 @@ def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self):
             )
             .run_method_and_compare_outputs()
         )
+
+    class Conv2dDynamicQuant(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 10, 3)
+
+        def forward(self, x):
+            return self.conv(x)
+
+    def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None:
+        (
+            Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),))
+            .quantize(
+                Quantize(
+                    quantization_config=get_symmetric_quantization_config(
+                        is_dynamic=True
+                    )
+                )
+            )
+            .export()
+            .to_edge()
+            .run_passes(self.PassStage)
+            .check(
+                [
+                    self.to_copy_name,
+                    self.choose_qparams_name,
+                    self.dynamic_quant_name,
+                    self.dequant_name,
+                    self.conv_name,
+                    self.to_copy_name,
+                ]
+            )
+            .run_method_and_compare_outputs()
+        )

From 7c534545344ed2b831e163ca0f4d207e253a7a23 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sun, 20 Apr 2025 09:57:54 -0700
Subject: [PATCH 21/24] Add check to determine if node feeds into conv and set
 non-batch dims accordingly

---
 backends/xnnpack/operators/quant_params.py | 24 +++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py
index ddb9db865f3..fbee1d192cf 100644
--- a/backends/xnnpack/operators/quant_params.py
+++ b/backends/xnnpack/operators/quant_params.py
@@ -141,17 +141,27 @@ def quantize_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
                 tensor, self.scale, self.zp, self.qmin, self.qmax, self.dtype
             )
 
+    # Temporary helper until non-batch dimensions can be inferred.
+    # True if `node` or any transitive user has "convolution" in its target string.
+    @staticmethod
+    def _feeds_into_conv(node: torch.fx.Node) -> bool:
+        users_list = [node]  # worklist, seeded with the node itself
+
+        while users_list:
+            current_user = users_list.pop()
+            if "convolution" in str(current_user.target):
+                return True  # NOTE(review): matches any downstream conv
+            users_list.extend(current_user.users)  # NOTE(review): no visited set
+
+        return False
+
     @classmethod
     def _from_dynamic_input_node(cls, quant_node: torch.fx.Node) -> QuantParams:
         q_input = quant_node.args[0]  # fp32 input
         assert isinstance(q_input, torch.fx.Node)
-        num_nonbatch_dims = 1
-
-        # Compute non-batch dimensions (shape length - 1), defaulting to 1
-        q_input_val = q_input.meta.get("val", None)
-        q_input_shape = getattr(q_input_val, "shape", None)
-        if q_input_shape is not None:
-            num_nonbatch_dims = max(len(q_input_shape) - 1, 1)
+        # TODO - materialize this from the quant_node scale count and val shape
+        # Set non-batch dims to 3 if node feeds into conv (only 2D is supported), otherwise set to 1 for linear
+        num_nonbatch_dims = 3 if cls._feeds_into_conv(quant_node) else 1
 
         return cls(
             per_channel=False,  # True is not valid

From eaba81962d63390e4a24abe6bb3471443e425e0c Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Sun, 20 Apr 2025 10:01:43 -0700
Subject: [PATCH 22/24] Add depthwise conv checks for dynamic quant

---
 .../xnnpack/partition/config/gemm_configs.py  | 34 ++++++++++++++-----
 .../quantizer/xnnpack_quantizer_utils.py      | 18 +++++++++-
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py
index a05bf623e05..b779f2e6204 100644
--- a/backends/xnnpack/partition/config/gemm_configs.py
+++ b/backends/xnnpack/partition/config/gemm_configs.py
@@ -9,6 +9,7 @@
 from typing import cast, List, Optional, Tuple
 
 import torch
+from executorch.backends.transforms import get_shape
 from executorch.backends.xnnpack.operators.quant_params import QuantParams
 from executorch.backends.xnnpack.partition.config.xnnpack_config import (
     ConfigPrecisionType,
@@ -358,18 +359,35 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
             why(node, "Only support 1D + 2D Conv")
             return False  # Only support 1D + 2D Conv
 
-        precision = self._detect_precision(node)
-        if precision == ConfigPrecisionType.DYNAMIC_QUANT and len(conv_stride) != 2:
-            why(node, "Only support 2D Conv for dynamic quantization")
-            return False
-
         kernel_node = get_input_node(node, 1)
+        kernel_shape = get_shape(kernel_node)
         weight_quant_params = QuantParams.from_weights(kernel_node, ep)
-
-        is_transpose = node.args[6]
         groups = cast(int, node.args[8])
+        is_transpose = node.args[6]
+
+        if is_transpose:
+            group_input_channels = int(kernel_shape[0] / groups)
+            group_output_channels = kernel_shape[1]
+        else:
+            group_input_channels = kernel_shape[1]
+            group_output_channels = int(kernel_shape[0] / groups)
+
+        is_depthwise = (
+            group_input_channels == 1
+            and group_output_channels % group_input_channels == 0
+        )
+
+        # XNNPACK does not support dynamic quantization convs that are not 2D or are depthwise
+        if self._detect_precision(node) == ConfigPrecisionType.DYNAMIC_QUANT and (
+            len(conv_stride) != 2 or is_depthwise
+        ):
+            why(
+                node,
+                "XNNPACK only supports standard 2D convolutions for dynamic quantization",
+            )
+            return False
 
-        # XNNPack does not support non-zero output padding in transposed
+        # XNNPACK does not support non-zero output padding in transposed
         # convolutions.
         if is_transpose and any(
             out_pad != 0 for out_pad in cast(List[int], node.args[7])
diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index 92eff3d0d68..91cb816b795 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -323,7 +323,7 @@ def _do_annotate_conv(
         assert isinstance(weight, Node)
         input_qspec_map[weight] = get_weight_qspec(quantization_config)
 
-        # Only annotate dynamically quantized conv if it's 2D
+        # Only annotate dynamically quantized conv if it's 2D and not depthwise
         if (
             quantization_config
             and quantization_config.input_activation
@@ -336,6 +336,22 @@ def _do_annotate_conv(
             if weight_shape is not None and len(weight_shape) != 4:
                 continue
 
+            # Default to 1 since groups is not available in the node
+            groups = 1
+            if is_conv_transpose:
+                group_input_channels = int(weight_shape[0] / groups)
+                group_output_channels = weight_shape[1]
+            else:
+                group_input_channels = weight_shape[1]
+                group_output_channels = int(weight_shape[0] / groups)
+
+            # Skip if depthwise
+            if (
+                group_input_channels == 1
+                and group_output_channels % group_input_channels == 0
+            ):
+                continue
+
         # adding weight node to the partition as well
         partition = [conv_node, conv_node.args[1]]
 

From e336df6f8422d6be82f1458fa331af4613eeeaf0 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Mon, 21 Apr 2025 16:38:56 -0700
Subject: [PATCH 23/24] Move depthwise conv check to helper function in utils

---
 .../xnnpack/partition/config/gemm_configs.py  | 16 ++--------
 .../quantizer/xnnpack_quantizer_utils.py      | 18 ++---------
 backends/xnnpack/utils/utils.py               | 30 +++++++++++++++++++
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py
index b779f2e6204..67bccbc52d1 100644
--- a/backends/xnnpack/partition/config/gemm_configs.py
+++ b/backends/xnnpack/partition/config/gemm_configs.py
@@ -28,6 +28,7 @@
 )
 from executorch.backends.xnnpack.utils.utils import (
     get_input_node,
+    is_depthwise_conv,
     is_getitem,
     is_node,
     is_param_node,
@@ -365,21 +366,10 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
         groups = cast(int, node.args[8])
         is_transpose = node.args[6]
 
-        if is_transpose:
-            group_input_channels = int(kernel_shape[0] / groups)
-            group_output_channels = kernel_shape[1]
-        else:
-            group_input_channels = kernel_shape[1]
-            group_output_channels = int(kernel_shape[0] / groups)
-
-        is_depthwise = (
-            group_input_channels == 1
-            and group_output_channels % group_input_channels == 0
-        )
-
         # XNNPACK does not support dynamic quantization convs that are not 2D or are depthwise
         if self._detect_precision(node) == ConfigPrecisionType.DYNAMIC_QUANT and (
-            len(conv_stride) != 2 or is_depthwise
+            len(conv_stride) != 2
+            or is_depthwise_conv(kernel_shape, groups, is_transpose)
         ):
             why(
                 node,
diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
index 91cb816b795..4b961bef81d 100644
--- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
+++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py
@@ -6,6 +6,7 @@
 
 import torch
 import torch.nn.functional as F
+from executorch.backends.xnnpack.utils.utils import is_depthwise_conv
 from torch._subclasses import FakeTensor
 from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
 from torch.ao.quantization.pt2e.export_utils import _WrapperModule
@@ -29,7 +30,6 @@
 )
 from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
 
-
 __all__ = [
     "OperatorConfig",
     "OperatorPatternType",
@@ -336,20 +336,8 @@ def _do_annotate_conv(
             if weight_shape is not None and len(weight_shape) != 4:
                 continue
 
-            # Default to 1 since groups is not available in the node
-            groups = 1
-            if is_conv_transpose:
-                group_input_channels = int(weight_shape[0] / groups)
-                group_output_channels = weight_shape[1]
-            else:
-                group_input_channels = weight_shape[1]
-                group_output_channels = int(weight_shape[0] / groups)
-
-            # Skip if depthwise
-            if (
-                group_input_channels == 1
-                and group_output_channels % group_input_channels == 0
-            ):
+            # Skip if depthwise (default to groups=1 since it's not an arg)
+            if is_depthwise_conv(weight_shape, 1, is_conv_transpose):
                 continue
 
         # adding weight node to the partition as well
diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py
index fab95618807..b23fd444117 100644
--- a/backends/xnnpack/utils/utils.py
+++ b/backends/xnnpack/utils/utils.py
@@ -158,3 +158,33 @@ def get_source_fn(node: torch.fx.Node) -> Optional[torch.fx.Node]:
         return None
     source_fn = source_fn_st[-1]
     return source_fn[1]
+
+
+def is_depthwise_conv(
+    kernel_shape: Tuple[int, ...], groups: int = 1, is_transpose: bool = False
+) -> bool:
+    """
+    Return True if the convolution is depthwise: each group has exactly one
+    input channel (group_input_channels == 1, i.e. groups == input_channels).
+
+    kernel_shape for standard convolutions:
+        (out_channels, in_channels_per_group, height, width)
+    kernel_shape for transposed convolutions:
+        (in_channels, out_channels_per_group, height, width)
+
+    Returns False if kernel_shape has fewer than 2 dims or groups < 1.
+    NOTE: the modulo term in the return is always true when the divisor is 1.
+    """
+    if len(kernel_shape) < 2 or groups < 1:
+        return False
+
+    if is_transpose:
+        group_input_channels = int(kernel_shape[0] / groups)
+        group_output_channels = kernel_shape[1]
+    else:
+        group_input_channels = kernel_shape[1]
+        group_output_channels = int(kernel_shape[0] / groups)
+
+    return (
+        group_input_channels == 1 and group_output_channels % group_input_channels == 0
+    )

From d82e08071cc5c7df51372b12db6b92778135f331 Mon Sep 17 00:00:00 2001
From: Zuby Afzal <git.zuby@gmail.com>
Date: Mon, 21 Apr 2025 16:40:22 -0700
Subject: [PATCH 24/24] Use existing Conv2d class; get conv count from model

---
 backends/xnnpack/test/ops/test_conv2d.py | 31 ++++++++++++------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py
index 4d1b387b8ec..92bb03c907a 100644
--- a/backends/xnnpack/test/ops/test_conv2d.py
+++ b/backends/xnnpack/test/ops/test_conv2d.py
@@ -173,18 +173,6 @@ def get_inputs(self):
         return (torch.randn(2, 2, 4, 4),)
 
 
-class Conv2dDQ(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3)
-
-    def forward(self, x):
-        return self.conv(x)
-
-    def get_inputs(self):
-        return (torch.randn(1, 3, 8, 8),)
-
-
 class Conv2dDQSeq(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -210,7 +198,7 @@ def __init__(self):
             in_channels=3, out_channels=8, kernel_size=3, padding=1
         )
         self.second = torch.nn.Conv2d(
-            in_channels=3, out_channels=10, kernel_size=3, padding=1
+            in_channels=3, out_channels=8, kernel_size=3, padding=1
         )
 
     def forward(self, x):
@@ -785,13 +773,24 @@ def forward(self, x):
         )
 
     def test_dq_conv2d(self) -> None:
-        model = Conv2dDQ()
+        model = Conv2d(
+            in_channels=3,
+            out_channels=10,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(0, 0),
+            batches=1,
+            width=8,
+            height=8,
+        )
         self._test_dq(model)
 
     def test_dq_conv2d_seq(self) -> None:
         model = Conv2dDQSeq()
-        self._test_dq(model, conv_count=2)
+        conv_count = sum(1 for m in model.modules() if type(m) is torch.nn.Conv2d)
+        self._test_dq(model, conv_count)
 
     def test_dq_conv2d_parallel(self) -> None:
         model = Conv2dDQParallel()
-        self._test_dq(model, conv_count=2)
+        conv_count = sum(1 for m in model.modules() if type(m) is torch.nn.Conv2d)
+        self._test_dq(model, conv_count)