From de3e6ffcae638cff616216c9d07c036afce807be Mon Sep 17 00:00:00 2001
From: Zingo Andersen <zingo@zingo.org>
Date: Thu, 20 Nov 2025 13:50:34 +0100
Subject: [PATCH] Revert "Cortex_M backend: Add conv op (#15896)"

This reverts commit 213d24c514d9009a5aab27a0eb074c6cc7f01e65.
---
 backends/cortex_m/CMakeLists.txt              |   1 -
 backends/cortex_m/ops/op_quantized_conv2d.cpp | 236 -----------------
 backends/cortex_m/ops/operators.py            | 162 ------------
 backends/cortex_m/ops/operators.yaml          |   6 -
 backends/cortex_m/passes/__init__.py          |   2 +-
 .../passes/convert_to_cortex_m_pass.py        | 222 ----------------
 .../cortex_m/passes/cortex_m_pass_manager.py  |  33 +--
 .../passes/quantized_linear_fusion_pass.py    | 151 +++++++++++
 .../cortex_m/quantizer/operator_configs.py    |  12 -
 .../quantizer/quantization_configs.py         |  31 +--
 backends/cortex_m/quantizer/quantizer.py      |  20 --
 backends/cortex_m/test/ops/test_conv.py       | 247 ------------------
 12 files changed, 164 insertions(+), 959 deletions(-)
 delete mode 100644 backends/cortex_m/ops/op_quantized_conv2d.cpp
 delete mode 100644 backends/cortex_m/passes/convert_to_cortex_m_pass.py
 create mode 100644 backends/cortex_m/passes/quantized_linear_fusion_pass.py
 delete mode 100644 backends/cortex_m/test/ops/test_conv.py
diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
index ac330d4b015..5354186167a 100644
--- a/backends/cortex_m/CMakeLists.txt
+++ b/backends/cortex_m/CMakeLists.txt
@@ -56,7 +56,6 @@ set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_conv2d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_mul.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
deleted file mode 100644
index ad14af98865..00000000000
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright 2025 Arm Limited and/or its affiliates.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "cortex_m_ops_common.h"
-
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
-namespace cortex_m {
-namespace native {
-
-using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
-
-namespace {
-constexpr int64_t kConvDim = 4;
-
-bool validate_conv2d_arguments(
-    KernelRuntimeContext& context,
-    const Tensor& input,
-    const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
-    const Tensor& output,
-    const IntArrayRef& stride,
-    const IntArrayRef& padding,
-    const IntArrayRef& dilation,
-    const Tensor& requantize_multipliers,
-    const Tensor& requantize_shifts) {
-  if (input.dim() != kConvDim || weight.dim() != kConvDim ||
-      output.dim() != kConvDim) {
-    ET_LOG(Error, "quantized_conv2d_out: tensors must be 4-D");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  // Check for channels_last dim_order (NHWC: 0, 2, 3, 1)
-  // Skip check if channels == 1, as dim_order is ambiguous in that case
-  constexpr executorch::aten::DimOrderType kChannelsLastDimOrder[] = {
-      0, 2, 3, 1};
-  executorch::aten::ArrayRef<executorch::aten::DimOrderType>
-      channels_last_order(kChannelsLastDimOrder, 4);
-
-  if (input.size(1) > 1 && input.dim_order() != channels_last_order) {
-    ET_LOG(
-        Error,
-        "quantized_conv2d_out: input must have channels_last dim_order (NHWC)");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  if (output.size(1) > 1 && output.dim_order() != channels_last_order) {
-    ET_LOG(
-        Error,
-        "quantized_conv2d_out: output must have channels_last dim_order (NHWC)");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  if (input.scalar_type() != ScalarType::Char ||
-      output.scalar_type() != ScalarType::Char) {
-    ET_LOG(Error, "quantized_conv2d_out: input and output must be int8");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  if (weight.scalar_type() != ScalarType::Char) {
-    ET_LOG(Error, "quantized_conv2d_out: weight must be int8");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  if (bias.has_value() && bias.value().scalar_type() != ScalarType::Int) {
-    ET_LOG(Error, "quantized_conv2d_out: bias must be int32 if provided");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  if (stride.size() != 2 || padding.size() != 2 || dilation.size() != 2) {
-    ET_LOG(
-        Error,
-        "quantized_conv2d_out: stride, padding, and dilation must have length 2");
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  const int64_t out_channels = output.size(1);
-  if (requantize_multipliers.size(0) != out_channels ||
-      requantize_shifts.size(0) != out_channels) {
-    ET_LOG(
-        Error,
-        "quantized_conv2d_out: per-channel params must match output channels (%zd)",
-        out_channels);
-    context.fail(Error::InvalidArgument);
-    return false;
-  }
-
-  return true;
-}
-} // namespace
-
-Tensor& quantized_conv2d_out(
-    KernelRuntimeContext& context,
-    const Tensor& input,
-    const Tensor& weight,
-    const torch::executor::optional<Tensor>& bias,
-    const IntArrayRef stride,
-    const IntArrayRef padding,
-    const IntArrayRef dilation,
-    const int64_t input_offset,
-    const int64_t output_offset,
-    const Tensor& requantize_multipliers,
-    const Tensor& requantize_shifts,
-    const int64_t activation_min,
-    const int64_t activation_max,
-    Tensor& out) {
-  if (!validate_conv2d_arguments(
-          context,
-          input,
-          weight,
-          bias,
-          out,
-          stride,
-          padding,
-          dilation,
-          requantize_multipliers,
-          requantize_shifts)) {
-    return out;
-  }
-
-  const int32_t batch = static_cast<int32_t>(input.size(0));
-  const int32_t input_channels = static_cast<int32_t>(input.size(1));
-  const int32_t input_height = static_cast<int32_t>(input.size(2));
-  const int32_t input_width = static_cast<int32_t>(input.size(3));
-
-  const int32_t kernel_output_channels = static_cast<int32_t>(weight.size(0));
-  const int32_t kernel_height = static_cast<int32_t>(weight.size(1));
-  const int32_t kernel_width = static_cast<int32_t>(weight.size(2));
-  const int32_t kernel_input_channels = static_cast<int32_t>(weight.size(3));
-
-  const int32_t output_channels = static_cast<int32_t>(out.size(1));
-  const int32_t output_height = static_cast<int32_t>(out.size(2));
-  const int32_t output_width = static_cast<int32_t>(out.size(3));
-
-  const int32_t input_offset_val = static_cast<int32_t>(input_offset);
-  const int32_t output_offset_val = static_cast<int32_t>(output_offset);
-  const int32_t activation_min_val = static_cast<int32_t>(activation_min);
-  const int32_t activation_max_val = static_cast<int32_t>(activation_max);
-
-  const cmsis_nn_dims input_dims{
-      batch, input_height, input_width, input_channels};
-  const cmsis_nn_dims filter_dims{
-      kernel_output_channels,
-      kernel_height,
-      kernel_width,
-      kernel_input_channels};
-  const cmsis_nn_dims output_dims{
-      batch, output_height, output_width, output_channels};
-  const cmsis_nn_dims bias_dims{1, 1, 1, output_channels};
-  const cmsis_nn_dims upscale_dims{1, 1, 1, 1};
-
-  cmsis_nn_conv_params conv_params;
-  conv_params.input_offset = input_offset_val;
-  conv_params.output_offset = output_offset_val;
-  conv_params.stride.h = static_cast<const int32_t>(stride[0]);
-  conv_params.stride.w = static_cast<const int32_t>(stride[1]);
-  conv_params.padding.h = static_cast<const int32_t>(padding[0]);
-  conv_params.padding.w = static_cast<const int32_t>(padding[1]);
-  conv_params.dilation.h = static_cast<const int32_t>(dilation[0]);
-  conv_params.dilation.w = static_cast<const int32_t>(dilation[1]);
-  conv_params.activation.min = activation_min_val;
-  conv_params.activation.max = activation_max_val;
-
-  cmsis_nn_per_channel_quant_params quant_params;
-  quant_params.multiplier = requantize_multipliers.data_ptr<int32_t>();
-  quant_params.shift = requantize_shifts.data_ptr<int32_t>();
-
-  const int8_t* input_data = input.const_data_ptr<int8_t>();
-  const int8_t* weight_data = weight.const_data_ptr<int8_t>();
-  int8_t* output_data = out.mutable_data_ptr<int8_t>();
-  const int32_t* bias_data =
-      bias.has_value() ? bias.value().const_data_ptr<int32_t>() : nullptr;
-
-  cmsis_nn_context cmsis_context;
-  cmsis_context.buf = nullptr;
-  cmsis_context.size = 0;
-
-  const size_t buffer_bytes = static_cast<size_t>(
-      arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims));
-  if (buffer_bytes > 0) {
-    auto buffer_or_error =
-        context.allocate_temp(buffer_bytes, alignof(int16_t));
-    if (!buffer_or_error.ok()) {
-      if (buffer_or_error.error() != Error::NotFound) {
-        ET_LOG(
-            Error,
-            "quantized_conv2d_out: failed to allocate scratch buffer (%d)",
-            static_cast<int>(buffer_or_error.error()));
-        context.fail(buffer_or_error.error());
-        return out;
-      }
-    } else {
-      cmsis_context.buf = buffer_or_error.get();
-      cmsis_context.size = buffer_bytes;
-    }
-  }
-
-  const arm_cmsis_nn_status status = arm_convolve_wrapper_s8(
-      &cmsis_context,
-      &conv_params,
-      &quant_params,
-      &input_dims,
-      input_data,
-      &filter_dims,
-      weight_data,
-      &bias_dims,
-      bias_data,
-      &output_dims,
-      output_data);
-
-  if (status != ARM_CMSIS_NN_SUCCESS) {
-    ET_LOG(
-        Error,
-        "quantized_conv2d_out: arm_convolve_s8 failed with status %d",
-        status);
-    context.fail(Error::Internal);
-  }
-
-  return out;
-}
-
-} // namespace native
-} // namespace cortex_m
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index fe175ca9783..8ad8f2a68e7 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -6,10 +6,8 @@
 # LICENSE file in the root directory of this source tree.
 
 from math import prod
-from typing import Sequence
 
 import torch
-import torch.nn.functional as F
 from executorch.backends.cortex_m.passes.passes_utils import (
     requantize_cmsis,
     SHIFT_INT8,
@@ -410,163 +408,3 @@ def transpose_meta(input: torch.Tensor, perm) -> torch.Tensor:
 @impl(lib, "transpose", "CompositeExplicitAutograd")
 def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
     return input.permute(tuple(perm)).contiguous()
-
-
-# ===================================================================
-# QUANTIZED CONV2D OPERATION DEFINITION
-# ===================================================================
-
-lib.define(
-    "quantized_conv2d("
-    "Tensor input, "
-    "Tensor weight, "
-    "Tensor? bias, "
-    "int[] stride, "
-    "int[] padding, "
-    "int[] dilation, "
-    "int input_offset, "
-    "int output_offset, "
-    "Tensor requantize_multipliers, "
-    "Tensor requantize_shifts, "
-    "int activation_min, "
-    "int activation_max"
-    ") -> Tensor"
-)
-
-
-lib.define(
-    "quantized_conv2d.out("
-    "Tensor input, "
-    "Tensor weight, "
-    "Tensor? bias, "
-    "int[] stride, "
-    "int[] padding, "
-    "int[] dilation, "
-    "int input_offset, "
-    "int output_offset, "
-    "Tensor requantize_multipliers, "
-    "Tensor requantize_shifts, "
-    "int activation_min, "
-    "int activation_max, "
-    "*, Tensor(a!) out"
-    ") -> Tensor(a!)"
-)
-
-
-def _compute_conv2d_output_shape(
-    input_shape: torch.Size,
-    weight_shape: torch.Size,
-    stride: Sequence[int],
-    padding: Sequence[int],
-    dilation: Sequence[int],
-) -> torch.Size:
-    batch = input_shape[0]
-    in_height = input_shape[2]
-    in_width = input_shape[3]
-    # We store the weights in OHWI layout (out, kernel_h, kernel_w, in)
-    kernel_height = weight_shape[1]
-    kernel_width = weight_shape[2]
-
-    stride_h, stride_w = stride
-    pad_h, pad_w = padding
-    dilation_h, dilation_w = dilation
-
-    out_channels = weight_shape[0]
-    out_height = (
-        in_height + 2 * pad_h - dilation_h * (kernel_height - 1) - 1
-    ) // stride_h + 1
-    out_width = (
-        in_width + 2 * pad_w - dilation_w * (kernel_width - 1) - 1
-    ) // stride_w + 1
-    return torch.Size([batch, out_channels, out_height, out_width])
-
-
-@register_fake("cortex_m::quantized_conv2d")
-def quantized_conv2d_meta(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor | None,
-    stride: Sequence[int],
-    padding: Sequence[int],
-    dilation: Sequence[int],
-    input_offset: int,
-    output_offset: int,
-    requantize_multipliers: torch.Tensor,
-    requantize_shifts: torch.Tensor,
-    activation_min: int,
-    activation_max: int,
-) -> torch.Tensor:
-    stride_vals = list(stride)
-    padding_vals = list(padding)
-    dilation_vals = list(dilation)
-    output_shape = _compute_conv2d_output_shape(
-        input.shape, weight.shape, stride_vals, padding_vals, dilation_vals
-    )
-    return torch.empty(
-        output_shape,
-        dtype=torch.int8,
-        device=input.device,
-        memory_format=torch.channels_last,
-    )
-
-
-@impl(lib, "quantized_conv2d", "CompositeExplicitAutograd")
-def quantized_conv2d_impl(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor | None,
-    stride: Sequence[int],
-    padding: Sequence[int],
-    dilation: Sequence[int],
-    input_offset: int,
-    output_offset: int,
-    requantize_multipliers: torch.Tensor,
-    requantize_shifts: torch.Tensor,
-    activation_min: int,
-    activation_max: int,
-) -> torch.Tensor:
-    if input.dim() != 4 or weight.dim() != 4:
-        raise RuntimeError("quantized_conv2d expects 4D input and weight tensors")
-    # Convert to int32 for accumulation and apply offsets
-    input_int32 = input.to(torch.int32) + int(input_offset)
-    weight_int32 = weight.to(torch.int32)
-
-    if bias is None:
-        bias_int32 = torch.zeros(
-            weight.shape[0], dtype=torch.int32, device=input.device
-        )
-    else:
-        bias_int32 = bias.to(torch.int32)
-
-    input_channels = input.shape[1]
-    kernel_input_channels = weight.shape[3]
-    groups = input_channels // kernel_input_channels
-
-    # Convert weights back to OIHW layout expected by torch.nn.functional.conv2d
-    weight_oi_hw = weight_int32.permute(0, 3, 1, 2).contiguous()
-
-    conv_acc = F.conv2d(
-        input_int32,
-        weight_oi_hw,
-        bias_int32,
-        stride=tuple(stride),
-        padding=tuple(padding),
-        dilation=tuple(dilation),
-        groups=groups,
-    )
-
-    result_channels = []
-    for output_channel_i in range(conv_acc.shape[1]):
-        result_channel = requantize_cmsis(
-            conv_acc[:, output_channel_i, :, :],
-            int(requantize_multipliers[output_channel_i]),
-            int(requantize_shifts[output_channel_i]),
-        )
-        result_channels.append(result_channel)
-
-    result = torch.stack(result_channels, dim=1)
-
-    result += output_offset
-    result = torch.clamp(result, activation_min, activation_max)
-
-    return result.to(torch.int8)
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index 0b0b2f5c715..30365e730da 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -52,9 +52,3 @@
   kernels:
     - arg_meta: null
       kernel_name: cortex_m::transpose_out
-
-- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: cortex_m::quantized_conv2d_out
diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py
index d1bb580d871..26456138cb2 100644
--- a/backends/cortex_m/passes/__init__.py
+++ b/backends/cortex_m/passes/__init__.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from .convert_to_cortex_m_pass import ConvertToCortexMPass  # noqa
+from .quantized_linear_fusion_pass import QuantizedLinearFusionPass  # noqa
 from .quantized_op_fusion_pass import QuantizedOpFusionPass  # noqa
 from .replace_quant_nodes_pass import ReplaceQuantNodesPass  # noqa
 from .cortex_m_pass_manager import CortexMPassManager  # noqa  # usort: skip
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
deleted file mode 100644
index c849b2949bf..00000000000
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-import executorch.backends.cortex_m.ops.operators  # noqa
-
-import torch
-import torch.fx
-from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
-
-from executorch.backends.transforms.utils import (
-    create_constant_placeholder,
-    get_param_tensor,
-)
-
-from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
-from executorch.exir.dialects._ops import ops as exir_ops
-from torch.export.graph_signature import InputKind
-from torch.fx.passes.infra.pass_manager import PassResult
-
-
-class ConvertToCortexMPass(XNNPACKPass):
-    """
-    Cortex-M backend pass for replacing supported quantized kernels with Cortex-M
-    accelerated kernels.
-
-    Used for ops which require changes to input tensors which is not supported
-    by call_operator.
-    """
-
-    def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
-        """
-        Computes the precomputed kernel sum term (bias optional)
-            a * sum_j(wij + b) + ci
-
-        for i = (1, ..., n), where j indexes the input activations.
-        """
-        weights_transposed = weights.T
-        weights_int32 = weights_transposed.to(torch.int32)
-        offset_weights = weights_int32 + weight_offset
-        kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32)
-        kernel_sum_offset = kernel_sum * input_offset
-
-        if bias is not None:
-            kernel_sum_offset += bias
-
-        return kernel_sum_offset
-
-    def _get_linear_replacement(self, node):
-        """
-         Let
-        - yi be the output activations (y1, ... yn)
-        - xj be the input activations (x1, ... xm)
-        - wij be the weights (w11, ... wnm)
-        - a be the input offset
-        - b be the weight offset
-        - ci be the bias
-
-        Then the linear operation can be written as:
-        yi = sum_j((xj + a) * (wij + b)) + ci
-        = sum_j(xj*wij + xj*b + a*wij + a*b) + ci
-        = sum_j(xj*wij) + sum_j(xj)*b + (a * sum_j(wij + b) + ci)
-        = sum_j(xj*wij) + sum_j(xj)*b + kernel_sum
-
-        where kernel_sum is precomputed aot.
-        """
-        input_scale = node.meta["input_qparams"][0].scale
-        input_zp = node.meta["input_qparams"][0].zp
-        weight_scale = node.meta["input_qparams"][1].scale
-        weight_zp = node.meta["input_qparams"][1].zp
-        output_scale = node.meta["output_qparams"][0].scale
-        output_zp = node.meta["output_qparams"][0].zp
-        output_min = node.meta["output_qparams"][0].qmin
-        output_max = node.meta["output_qparams"][0].qmax
-
-        quantized_multiplier, quantized_shift = quantize_multiplier_aot(
-            (input_scale * weight_scale) / output_scale
-        )
-
-        # TODO: Add support for configuring the backend to support other extensions.
-        # Kernel sum is only used in the CMSIS-NN implementation for the MVE extension,
-        # so this should be optional.
-        weights = node.args[1]
-        weights_tensor = get_param_tensor(self.exported_program, weights)
-        bias_tensor = (
-            get_param_tensor(self.exported_program, node.args[2])
-            if len(node.args) > 2
-            else None
-        )
-        kernel_sum_tensor = self._compute_kernel_sum(
-            weights_tensor, bias_tensor, -input_zp, -weight_zp
-        )
-        with node.graph.inserting_after(weights):
-            kernel_sum = create_constant_placeholder(
-                self.exported_program,
-                node.graph,
-                node.name + "_kernel_sum",
-                InputKind.PARAMETER,
-                kernel_sum_tensor,
-            )
-
-        args = (
-            node.args[0],
-            weights,
-            None,
-            kernel_sum,
-            -input_zp,
-            -weight_zp,
-            output_zp,
-            [quantized_multiplier],
-            [quantized_shift],
-            output_max,
-            output_min,
-        )
-
-        return exir_ops.edge.cortex_m.quantized_linear.default, args
-
-    def _get_convolution_replacement(self, node) -> int:
-        (
-            x,
-            weight,
-            bias,
-            stride,
-            padding,
-            dilation,
-            transposed,
-            output_padding,
-            groups,
-        ) = node.args
-
-        # Extract values
-        input_scale = node.meta["input_qparams"][0].scale
-        input_zero_point = node.meta["input_qparams"][0].zp
-        weight_scales = node.meta["input_qparams"][1].scale
-        if not isinstance(weight_scales, list):
-            weight_scales = [weight_scales] * weight.data.shape[0]
-
-        output_scale = node.meta["output_qparams"][0].scale
-        output_zero_point = node.meta["output_qparams"][0].zp
-
-        quantized_multipliers = []
-        quantized_shifts = []
-        for weight_scale in weight_scales:
-            quantized_multiplier, quantized_shift = quantize_multiplier_aot(
-                input_scale * weight_scale / output_scale
-            )
-            quantized_multipliers.append(quantized_multiplier)
-            quantized_shifts.append(quantized_shift)
-
-        # Permute the weight tensor to the OHWI layout expected by CMSIS-NN.
-        weight_tensor = get_param_tensor(self.exported_program, weight)
-        weight_permuted = weight_tensor.permute(0, 2, 3, 1).contiguous(
-            memory_format=torch.channels_last
-        )
-
-        with node.graph.inserting_after(weight):
-            weight_nhwc = create_constant_placeholder(
-                self.exported_program,
-                node.graph,
-                node.name + "_weight_nhwc",
-                InputKind.PARAMETER,
-                weight_permuted,
-            )
-
-        new_args = (
-            x,
-            weight_nhwc,
-            bias,
-            stride,
-            padding,
-            dilation,
-            -input_zero_point,
-            output_zero_point,
-            torch.tensor(quantized_multipliers, dtype=torch.int32),
-            torch.tensor(quantized_shifts, dtype=torch.int32),
-            -128,
-            127,
-        )
-        return exir_ops.edge.cortex_m.quantized_conv2d.default, new_args
-
-    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
-        modified = False
-        for node in graph_module.graph.nodes:
-            if node.op != "call_function":
-                continue
-            if (
-                node.meta.get("input_qparams", {}) == {}
-                or node.meta.get("output_qparams", {}) == {}
-            ):
-                continue
-
-            match node.target:
-                case exir_ops.edge.aten.linear.default:
-                    op, args = self._get_linear_replacement(node)
-                case exir_ops.edge.aten.convolution.default:
-                    op, args = self._get_convolution_replacement(node)
-                case _:
-                    continue
-
-            with graph_module.graph.inserting_before(node):
-                cortex_m_op = graph_module.graph.create_node(
-                    "call_function",
-                    target=op,
-                    args=args,
-                    kwargs={},
-                )
-
-                node.replace_all_uses_with(cortex_m_op)
-                graph_module.graph.erase_node(node)
-
-            modified = True
-
-        if modified:
-            graph_module.graph.eliminate_dead_code()
-            graph_module.recompile()
-            graph_module = super().call(graph_module).graph_module
-
-        return PassResult(graph_module, modified)
diff --git a/backends/cortex_m/passes/cortex_m_pass_manager.py b/backends/cortex_m/passes/cortex_m_pass_manager.py
index 948a60121b4..2b880f5ed05 100644
--- a/backends/cortex_m/passes/cortex_m_pass_manager.py
+++ b/backends/cortex_m/passes/cortex_m_pass_manager.py
@@ -4,34 +4,30 @@
 # LICENSE file in the root directory of this source tree.
 
 
-import inspect
-
 from executorch.backends.arm._passes import (
     FoldAndAnnotateQParamsPass,
     ScalarsToAttributePass,
 )
 from executorch.backends.cortex_m.passes import (
-    ConvertToCortexMPass,
+    QuantizedLinearFusionPass,
     QuantizedOpFusionPass,
     ReplaceQuantNodesPass,
 )
 from executorch.backends.transforms.replace_scalar_with_tensor import (
     ReplaceScalarWithTensorArgPass,
 )
+from executorch.backends.xnnpack._passes import XNNPACKPassManager
 from executorch.exir.pass_base import ExportPass
-from executorch.exir.pass_manager import PassManager
-from executorch.exir.program._program import _transform
-from torch.export import ExportedProgram
 
 
-class CortexMPassManager(PassManager):
+class CortexMPassManager(XNNPACKPassManager):
 
     pass_list: list[ExportPass] = [
         FoldAndAnnotateQParamsPass,
         ReplaceScalarWithTensorArgPass,
         ReplaceQuantNodesPass,
         QuantizedOpFusionPass,
-        ConvertToCortexMPass,
+        QuantizedLinearFusionPass,
     ]
 
     pass_list_transform_for_annotation: list[ExportPass] = [
@@ -40,29 +36,10 @@ class CortexMPassManager(PassManager):
     ]
 
     def __init__(self, exported_program, passes=None):
-        self.exported_program = exported_program
-        if passes is not None:
-            self.passes = passes
-        else:
-            self.passes = self.pass_list
+        super().__init__(exported_program, passes or self.pass_list)
 
     def transform_for_annotation(self, model):
         passes = self.pass_list_transform_for_annotation
         for p in passes:
             model = p().call(model).graph_module
         return model
-
-    def transform(self) -> ExportedProgram:
-        ep = self.exported_program
-        for pass_ in self.passes:
-            signature = inspect.signature(pass_.__init__)
-            if "exported_program" in signature.parameters:
-                transform_pass = pass_(ep)
-            elif issubclass(pass_, ExportPass):
-                transform_pass = pass_()
-            else:
-                raise RuntimeError(
-                    f"Expecting ExportPass or ExportPass(), but got pass: {pass_} with type: {type(pass_)}"
-                )
-            ep = _transform(ep, transform_pass)
-        return ep
diff --git a/backends/cortex_m/passes/quantized_linear_fusion_pass.py b/backends/cortex_m/passes/quantized_linear_fusion_pass.py
new file mode 100644
index 00000000000..f921f5ce621
--- /dev/null
+++ b/backends/cortex_m/passes/quantized_linear_fusion_pass.py
@@ -0,0 +1,151 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import executorch.backends.cortex_m.ops.operators  # noqa
+
+import torch
+import torch.fx
+from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
+
+from executorch.backends.transforms.utils import (
+    create_constant_placeholder,
+    get_param_tensor,
+)
+
+from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export.graph_signature import InputKind
+from torch.fx.passes.infra.pass_manager import PassResult
+
+
+class QuantizedLinearFusionPass(XNNPACKPass):
+    """
+    Cortex-M backend pass that fuses quantized linear-like patterns.
+    Fuses: dequantize -> [linear/addmm/fc_ops] -> quantize
+    Into: cortex_m.quantized_linear.default with direct parameters.
+
+    Note that the optimized implementation makes use of the following rewrite:
+
+    Let
+    - yi be the output activations (y1, ... yn)
+    - xj be the input activations (x1, ... xm)
+    - wij be the weights (w11, ... wnm)
+    - a be the input offset
+    - b be the weight offset
+    - ci be the bias
+
+    Then the linear operation can be written as:
+    yi = sum_j((xj + a) * (wij + b)) + ci
+       = sum_j(xj*wij + xj*b + a*wij + a*b) + ci
+       = sum_j(xj*wij) + sum_j(xj)*b + (a * sum_j(wij + b) + ci)
+       = sum_j(xj*wij) + sum_j(xj)*b + kernel_sum
+
+    where kernel_sum is precomputed aot.
+    """
+
+    def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
+        """
+        Computes the precomputed kernel sum term (bias optional)
+            a * sum_j(wij + b) + ci
+
+        as defined above, for i = (1, ..., n) where j indexes the input activations.
+        """
+        weights_transposed = weights.T
+        weights_int32 = weights_transposed.to(torch.int32)
+        offset_weights = weights_int32 + weight_offset
+        kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32)
+        kernel_sum_offset = kernel_sum * input_offset
+
+        if bias is not None:
+            kernel_sum_offset += bias
+
+        return kernel_sum_offset
+
+    def _get_linear_replacement(self, args, meta, node):
+        input_scale = meta["input_qparams"][0].scale
+        input_zp = meta["input_qparams"][0].zp
+        weight_scale = meta["input_qparams"][1].scale
+        weight_zp = meta["input_qparams"][1].zp
+        output_scale = meta["output_qparams"][0].scale
+        output_zp = meta["output_qparams"][0].zp
+        output_min = meta["output_qparams"][0].qmin
+        output_max = meta["output_qparams"][0].qmax
+
+        quantized_multiplier, quantized_shift = quantize_multiplier_aot(
+            (input_scale * weight_scale) / output_scale
+        )
+
+        # TODO: Add support for configuring the backend to support other extensions.
+        # Kernel sum is only used in the CMSIS-NN implementation for the MVE extension,
+        # so this should be optional.
+        weights = args[1]
+        weights_tensor = get_param_tensor(self.exported_program, weights)
+        bias_tensor = (
+            get_param_tensor(self.exported_program, args[2]) if len(args) > 2 else None
+        )
+        kernel_sum_tensor = self._compute_kernel_sum(
+            weights_tensor, bias_tensor, -input_zp, -weight_zp
+        )
+        with node.graph.inserting_after(weights):
+            kernel_sum = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_kernel_sum",
+                InputKind.PARAMETER,
+                kernel_sum_tensor,
+            )
+
+        args = (
+            args[0],
+            weights,
+            None,
+            kernel_sum,
+            -input_zp,
+            -weight_zp,
+            output_zp,
+            [quantized_multiplier],
+            [quantized_shift],
+            output_max,
+            output_min,
+        )
+
+        return args
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        modified = False
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function":
+                continue
+            if node.target != exir_ops.edge.aten.linear.default:
+                continue
+            if (
+                node.meta.get("input_qparams", {}) == {}
+                or node.meta.get("output_qparams", {}) == {}
+            ):
+                continue
+
+            args = self._get_linear_replacement(node.args, node.meta, node)
+            with graph_module.graph.inserting_before(node):
+                cortex_m_linear = graph_module.graph.create_node(
+                    "call_function",
+                    target=exir_ops.edge.cortex_m.quantized_linear.default,
+                    args=args,
+                    kwargs={},
+                )
+
+                node.replace_all_uses_with(cortex_m_linear)
+                graph_module.graph.erase_node(node)
+
+            modified = True
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, modified)
diff --git a/backends/cortex_m/quantizer/operator_configs.py b/backends/cortex_m/quantizer/operator_configs.py
index c6b15fb9a78..2936129819a 100644
--- a/backends/cortex_m/quantizer/operator_configs.py
+++ b/backends/cortex_m/quantizer/operator_configs.py
@@ -10,7 +10,6 @@
 import torch
 
 from executorch.backends.cortex_m.quantizer.quantization_configs import (
-    INT8_PER_CHANNEL_CONFIG,
     INT8_PER_TENSOR_CONFIG,
 )
 from torchao.quantization.pt2e.quantizer import OperatorConfig
@@ -26,12 +25,6 @@
     [torch.ops.aten.linear.default, torch.ops.aten.relu.default],
 ]
 
-CONV_OP_PATTERNS = [
-    [torch.ops.aten.conv1d.default],
-    [torch.ops.aten.conv2d.default],
-    [torch.ops.aten.conv3d.default],
-]
-
 # ----------------- OPERATOR CONFIG PRESETS -----------------
 INT8_BINARY_OPS_OPERATOR_CONFIG = OperatorConfig(
     INT8_PER_TENSOR_CONFIG, BINARY_OP_PATTERNS
@@ -41,8 +34,3 @@
     INT8_PER_TENSOR_CONFIG,
     LINEAR_OP_PATTERNS,
 )
-
-INT8_CONV_OPERATOR_CONFIG = OperatorConfig(
-    INT8_PER_CHANNEL_CONFIG,
-    CONV_OP_PATTERNS,
-)
diff --git a/backends/cortex_m/quantizer/quantization_configs.py b/backends/cortex_m/quantizer/quantization_configs.py
index c6600241b6d..7f43a89daad 100644
--- a/backends/cortex_m/quantizer/quantization_configs.py
+++ b/backends/cortex_m/quantizer/quantization_configs.py
@@ -5,11 +5,7 @@
 
 
 import torch
-from torchao.quantization.pt2e import (
-    HistogramObserver,
-    MinMaxObserver,
-    PerChannelMinMaxObserver,
-)
+from torchao.quantization.pt2e import HistogramObserver, MinMaxObserver
 from torchao.quantization.pt2e.quantizer import (
     DerivedQuantizationSpec,
     QuantizationConfig,
@@ -25,9 +21,8 @@
 
 INT8_WEIGHT_PER_CHANNEL_QSPEC = QuantizationSpec(
     dtype=torch.int8,
-    observer_or_fake_quant_ctr=PerChannelMinMaxObserver,
+    observer_or_fake_quant_ctr=MinMaxObserver,
     qscheme=torch.per_channel_symmetric,
-    ch_axis=0,
 )
 
 INT8_ACTIVATION_PER_TENSOR_QSPEC = QuantizationSpec(
@@ -38,9 +33,8 @@
 
 INT8_ACTIVATION_PER_CHANNEL_QSPEC = QuantizationSpec(
     dtype=torch.int8,
-    observer_or_fake_quant_ctr=PerChannelMinMaxObserver,
+    observer_or_fake_quant_ctr=HistogramObserver,
     qscheme=torch.per_channel_affine,
-    ch_axis=0,
 )
 
 
@@ -67,18 +61,7 @@ def _get_int32_bias_qspec(node):
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.int32).min,
         quant_max=torch.iinfo(torch.int32).max - 1,
-    )
-
-
-def _get_int32_per_channel_bias_qspec(node):
-    return DerivedQuantizationSpec(
-        derived_from=[(node.args[0], node), (node.args[1], node)],  # type: ignore[list-item]
-        derive_qparams_fn=_derive_bias_qparams_fn,
-        dtype=torch.int32,
-        quant_min=torch.iinfo(torch.int32).min,
-        quant_max=torch.iinfo(torch.int32).max - 1,
-        qscheme=torch.per_channel_symmetric,
-        ch_axis=0,
+        qscheme=torch.per_tensor_symmetric,
     )
 
 
@@ -92,8 +75,8 @@ def _get_int32_per_channel_bias_qspec(node):
 
 
 INT8_PER_CHANNEL_CONFIG = QuantizationConfig(
-    INT8_ACTIVATION_PER_TENSOR_QSPEC,
-    INT8_ACTIVATION_PER_TENSOR_QSPEC,
+    INT8_ACTIVATION_PER_CHANNEL_QSPEC,
+    INT8_ACTIVATION_PER_CHANNEL_QSPEC,
     INT8_WEIGHT_PER_CHANNEL_QSPEC,
-    _get_int32_per_channel_bias_qspec,
+    _get_int32_bias_qspec,
 )
diff --git a/backends/cortex_m/quantizer/quantizer.py b/backends/cortex_m/quantizer/quantizer.py
index 8bfc32049ed..1f9b06c27ec 100644
--- a/backends/cortex_m/quantizer/quantizer.py
+++ b/backends/cortex_m/quantizer/quantizer.py
@@ -12,9 +12,7 @@
 from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
 from executorch.backends.cortex_m.quantizer.operator_configs import (
     BINARY_OP_PATTERNS,
-    CONV_OP_PATTERNS,
     INT8_BINARY_OPS_OPERATOR_CONFIG,
-    INT8_CONV_OPERATOR_CONFIG,
     INT8_LINEAR_OPERATOR_CONFIG,
 )
 from executorch.backends.cortex_m.quantizer.quantization_configs import (
@@ -49,30 +47,12 @@ def broadcasting_filter(self, node: Optional[Node]) -> bool:
 
         return False
 
-    def nchw_filter(self, node: Optional[Node]) -> bool:
-        """
-        Filter function to exclude nodes that use NCHW memory format.
-        """
-        if node is None:
-            return False
-        if [node.target] not in CONV_OP_PATTERNS:
-            return False
-
-        tensor = get_first_fake_tensor(node)
-        if tensor is None:
-            return False
-
-        return not tensor.is_contiguous(memory_format=torch.channels_last)
-
     def __init__(self) -> None:
         quantizers: List[Quantizer] = [
             OperatorConfigQuantizer(
                 INT8_BINARY_OPS_OPERATOR_CONFIG, filter_fn=self.broadcasting_filter
             ),
             OperatorConfigQuantizer(INT8_LINEAR_OPERATOR_CONFIG),
-            OperatorConfigQuantizer(
-                INT8_CONV_OPERATOR_CONFIG, filter_fn=self.nchw_filter
-            ),
             InputQuantizer(INT8_PER_TENSOR_CONFIG),
             OutputQuantizer(INT8_PER_TENSOR_CONFIG),
             SharedQspecQuantizer(),
diff --git a/backends/cortex_m/test/ops/test_conv.py b/backends/cortex_m/test/ops/test_conv.py
deleted file mode 100644
index c6bb4815dca..00000000000
--- a/backends/cortex_m/test/ops/test_conv.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-import torch
-from executorch.backends.arm.test.common import parametrize
-from executorch.backends.cortex_m.test.tester import (
-    CortexMTester,
-    McuTestCase,
-    ramp_tensor,
-)
-
-
-class CortexMConv1D(torch.nn.Module):
-    ops_before_transforms = {}
-    ops_after_transforms = {}
-
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.conv = torch.nn.Conv1d(*args, **kwargs, bias=False)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class CortexMConv2D(torch.nn.Module):
-    ops_before_transforms = {
-        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1,
-    }
-
-    ops_after_transforms = {
-        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
-    }
-
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(*args, **kwargs, bias=False)
-        self.conv.weight.data.fill_(1.0)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class CortexMConv2DBias(torch.nn.Module):
-    ops_before_transforms = {
-        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 2,
-    }
-
-    ops_after_transforms = {
-        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
-    }
-
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(*args, **kwargs, bias=True)
-
-    def forward(self, x):
-
-        return self.conv(x)
-
-
-class CortexMConv3D(torch.nn.Module):
-    ops_before_transforms = {}
-
-    ops_after_transforms = {}
-
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.conv = torch.nn.Conv3d(*args, **kwargs, bias=False)
-        self.conv.weight.data.fill_(2.0)
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class CortexMConv2Dx3(torch.nn.Module):
-    ops_before_transforms = {
-        "executorch_exir_dialects_edge__ops_aten_convolution_default": 3,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 4,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 4,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 3,
-    }
-
-    ops_after_transforms = {
-        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 3,
-        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
-    }
-
-    def __init__(self):
-        super().__init__()
-        self.conv1 = torch.nn.Conv2d(3, 8, 3, padding=1, bias=False)
-        self.conv2 = torch.nn.Conv2d(8, 16, 3, padding=1, bias=False)
-        self.conv3 = torch.nn.Conv2d(16, 8, 3, padding=1, bias=False)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = self.conv3(x)
-        return x
-
-
-class CortexMConv2DReLU(torch.nn.Module):
-    ops_before_transforms = {
-        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
-        "executorch_exir_dialects_edge__ops_aten_relu_default": 1,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
-        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1,
-    }
-
-    ops_after_transforms = {
-        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
-        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
-        "executorch_exir_dialects_edge__ops_aten_relu_default": 1,
-    }
-
-    def __init__(self):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(4, 8, 3, padding=1, bias=True)
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.relu(x)
-        return x
-
-
-# in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode
-test_cases = {
-    "conv2d": McuTestCase(
-        model=CortexMConv2D(2, 4, 3),
-        example_inputs=(
-            ramp_tensor(1, 5, (1, 2, 5, 5)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_stride": McuTestCase(
-        model=CortexMConv2D(3, 4, (1, 2), stride=2),
-        example_inputs=(
-            ramp_tensor(-100, 10, (3, 3, 8, 8)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_padding": McuTestCase(
-        model=CortexMConv2D(3, 2, 3, padding=(4, 1)),
-        example_inputs=(
-            ramp_tensor(0, 1, (2, 3, 5, 5)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_dilation": McuTestCase(
-        model=CortexMConv2D(1, 4, 3, dilation=(2, 2)),
-        example_inputs=(
-            ramp_tensor(0, 10, (3, 1, 8, 8)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_groups": McuTestCase(
-        model=CortexMConv2D(4, 4, 1, groups=2),
-        example_inputs=(
-            ramp_tensor(0, 10, (1, 4, 1, 1)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_bias_ch_out_1": McuTestCase(
-        model=CortexMConv2DBias(5, 1, 1),
-        example_inputs=(
-            ramp_tensor(0, 10, (2, 5, 3, 3)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_bias_ch_out_4": McuTestCase(
-        model=CortexMConv2DBias(5, 4, (1, 2)),
-        example_inputs=(
-            ramp_tensor(-3, 3, (2, 5, 10, 10)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_nchw": McuTestCase(
-        model=CortexMConv2D(5, 5, 1),
-        example_inputs=(ramp_tensor(0, 10, (1, 5, 8, 8)),),
-    ),
-    "conv1d": McuTestCase(
-        model=CortexMConv1D(1, 1, 1),
-        example_inputs=(ramp_tensor(0, 10, (1, 3, 2)),),
-    ),
-    "conv3d": McuTestCase(
-        model=CortexMConv3D(1, 1, 1),
-        example_inputs=(
-            ramp_tensor(-1000, 1000, (2, 1, 3, 3, 3)).to(
-                memory_format=torch.channels_last_3d
-            ),
-        ),
-    ),
-    "conv2d_x3": McuTestCase(
-        model=CortexMConv2Dx3(),
-        example_inputs=(
-            ramp_tensor(0, 10, (1, 3, 8, 8)).to(memory_format=torch.channels_last),
-        ),
-    ),
-    "conv2d_relu": McuTestCase(
-        model=CortexMConv2DReLU(),
-        example_inputs=(
-            ramp_tensor(-5, 5, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
-        ),
-    ),
-}
-
-
-xfails_dialect = {
-    "conv2d_dilation": "NotImplementedError: 'slow_conv_dilated<>' not implemented for 'Int'",
-    "conv1d": "Currently not supported.",
-    "conv2d_nchw": "Currently not supported.",
-    "conv3d": "Currently not supported.",
-    "conv2d_relu": "Currently not supported.",
-}
-
-
-@parametrize("test_case", test_cases, xfails=xfails_dialect)
-def test_dialect_conv2d(test_case):
-    tester = CortexMTester(test_case.model, test_case.example_inputs)
-    tester.test_dialect(
-        test_case.model.ops_before_transforms,
-        test_case.model.ops_after_transforms,
-        qtol=1,
-    )
-
-
-xfails_implementation = {
-    "conv1d": "Currently not supported.",
-    "conv2d_nchw": "Currently not supported.",
-    "conv3d": "Currently not supported.",
-    "conv2d_relu": "Currently not supported.",
-}
-
-
-@parametrize("test_case", test_cases, xfails=xfails_implementation)
-def test_implementation_conv2d(test_case):
-    tester = CortexMTester(test_case.model, test_case.example_inputs)
-    tester.test_implementation(qtol=1)