From 102f34493bde6da600119b246071064a4eda5dd7 Mon Sep 17 00:00:00 2001
From: Roman Janik <roman.janik@nxp.com>
Date: Fri, 12 Sep 2025 14:22:33 +0200
Subject: [PATCH 1/2] Remove IR optimization in move_relu_before_concat.py

---
 .../optimizations/move_relu_before_concat.py  | 107 ------------------
 .../backend/ir/tflite_optimizer/optimizer.py  |   8 --
 2 files changed, 115 deletions(-)
 delete mode 100755 backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py

diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py
deleted file mode 100755
index 4d10b7c80ae..00000000000
--- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright 2024 NXP
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-from collections import defaultdict
-from copy import deepcopy
-
-from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model
-from executorch.backends.nxp.backend.ir.tflite_optimizer.operator_rules import (
-    AllInputsComeFrom,
-)
-from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import (
-    BaseOptimization,
-)
-from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import (
-    Op,
-    PatternMatcher,
-)
-from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import (
-    TensorHasOneConsumer,
-    TensorsHaveSameQuantization,
-)
-
-
-class MoveActivationBeforeConcatenation(BaseOptimization):
-    """
-    Move some operators around in the following pattern.
-    This is a common pattern that emerges from the conversion of separable convolutions.
-
-          │                │                            │                │
-      ┌───▼────┐       ┌───▼────┐                   ┌───▼────┐       ┌───▼────┐
-      │ Conv2D │  ...  │ Conv2D │                   │ Conv2D │  ...  │ Conv2D │
-      └───┬────┘       └───┬────┘                   └───┬────┘       └───┬────┘
-          └──┐          ┌──┘                            │                │
-          ┌──▼──────────▼─┐                          ┌──▼───┐         ┌──▼───┐
-          │ Concatenation │           ─────►         │ Relu │   ...   │ Relu │
-          └───────┬───────┘                          └──┬───┘         └──┬───┘
-                  │  'x'                                └──┐          ┌──┘
-               ┌──▼───┐                                 ┌──▼──────────▼─┐
-               │ Relu │                                 │ Concatenation │
-               └──┬───┘                                 └───────┬───────┘
-                  │  'y'                                        │
-    """
-
-    activations = ["Relu", "ReluN1To1", "Relu6", "Tanh", "Sign"]
-
-    def __call__(self) -> bool:
-        matcher = PatternMatcher(
-            self._builder,
-            [
-                Op(["Concatenation"], None, ["x"], [AllInputsComeFrom("Conv2D")]),
-                Op(self.activations, ["x"], ["y"]),
-            ],
-            [
-                TensorHasOneConsumer("x"),
-                # If the activation function is not changing the quantization parameters, it can be moved without
-                #  messing with the quantization elsewhere.
-                TensorsHaveSameQuantization(["x", "y"]),
-            ],
-        )
-
-        to_remove = []
-
-        # Mapping an operator to a list of operators. These operators (value) will later be added into the TFLite
-        #  model's `operators` in front of the specified operator (key).
-        to_add: dict[tflite_model.Operator, list[tflite_model.Operator]] = defaultdict(
-            lambda: []
-        )
-
-        for [concat, activation], _, _, _ in matcher.match_patterns():
-            new_concat_inputs = []
-            for concat_input in concat.tmp_inputs:
-                # Create a new operator for the activation function.
-                new_activation = deepcopy(activation)
-                new_activation.tmp_inputs = [concat_input]
-                new_activation_output = self._builder.duplicate_tensor(concat_input)
-                new_activation.tmp_outputs = [new_activation_output]
-
-                to_add[concat].append(
-                    new_activation
-                )  # Insert the new activation into the model later.
-
-                new_concat_inputs.append(
-                    new_activation_output
-                )  # Connect the activation with the `Concatenation`.
-
-            concat.tmp_inputs = new_concat_inputs
-
-            # Tensor rule ensures that only the activation functions is using the output of the `Concatenation`.
-            # It is safe to bypass.
-            concat.tmp_outputs[0] = activation.tmp_outputs[0]
-            to_remove.append(activation)
-
-        operators = self._builder.get_operators()
-
-        # Add the new activations into the model.
-        for concat, activations in to_add.items():
-            idx = operators.index(concat)
-            for activation in activations:
-                operators.insert(idx, activation)
-
-        # Remove the old activations.
-        for activation in to_remove:
-            operators.remove(activation)
-
-        return len(to_remove) != 0
diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py
index 3611c55e995..52de6f224eb 100755
--- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py
+++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py
@@ -11,9 +11,6 @@
 
 from executorch.backends.nxp.backend.ir import logger
 from executorch.backends.nxp.backend.ir.conversion_config import ConversionConfig
-from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import (
-    MoveActivationBeforeConcatenation,
-)
 from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.permute_fully_connected_weights_after_reshape import (
     PermuteFullyConnectedWeightsAfterReshape,
 )
@@ -29,8 +26,6 @@ class Optimization(Enum):
 
     PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE = 12
 
-    MOVE_ACTIVATION_BEFORE_CONCAT = 15
-
 
 class Optimizer:
     """
@@ -68,9 +63,6 @@ def __init__(
             Optimization.PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE: PermuteFullyConnectedWeightsAfterReshape(
                 builder, conversion_config
             ),
-            Optimization.MOVE_ACTIVATION_BEFORE_CONCAT: MoveActivationBeforeConcatenation(
-                builder, conversion_config
-            ),
         }
 
     def optimize(

From 73fb5e23a176da8cb75bb29374e4cba5e224f568 Mon Sep 17 00:00:00 2001
From: Roman Janik <roman.janik@nxp.com>
Date: Wed, 24 Sep 2025 16:51:03 +0200
Subject: [PATCH 2/2] Add Move activation before concat pass, Concat cluster
 quantization

---
 .../move_activation_before_concat.py          | 102 ++
 .../aten_passes/neutron_aten_pass_manager.py  |   9 +-
 backends/nxp/quantizer/neutron_quantizer.py   |  10 +-
 backends/nxp/quantizer/patterns.py            | 147 ++-
 backends/nxp/tests/test_batch_norm_fusion.py  |   9 +-
 backends/nxp/tests/test_gru_splitting.py      |  17 +-
 .../nxp/tests/test_linear_and_add_fusion.py   |  55 +-
 ...st_move_activation_before_concatenation.py | 947 ++++++++++++++++++
 .../test_removing_nodes_with_known_outputs.py |  13 +-
 .../nxp/tests/test_split_group_convolution.py |  20 +-
 10 files changed, 1283 insertions(+), 46 deletions(-)
 create mode 100644 backends/nxp/aten_passes/move_activation_before_concat.py
 create mode 100644 backends/nxp/tests/test_move_activation_before_concatenation.py

diff --git a/backends/nxp/aten_passes/move_activation_before_concat.py b/backends/nxp/aten_passes/move_activation_before_concat.py
new file mode 100644
index 00000000000..8ba306d42e2
--- /dev/null
+++ b/backends/nxp/aten_passes/move_activation_before_concat.py
@@ -0,0 +1,102 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
+
+from torch.fx import GraphModule, Node
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+
+
+class MoveActivationBeforeConcat(PassBase):
+    """Move some operators around in the following pattern.
+    This is a common pattern that emerges from the conversion of separable convolutions.
+    This optimization works together with joint quantization of compute nodes and activations. Without it,
+    it is not beneficial.
+
+             │                    │                               │                     │
+      ┌──────▼──────┐      ┌──────▼──────┐                 ┌──────▼──────┐       ┌──────▼──────┐
+      │ aten.conv2d │  ... │ aten.conv2d │                 │ aten.conv2d │  ...  │ aten.conv2d │
+      └──────┬──────┘      └──────┬──────┘                 └──────┬──────┘       └──────┬──────┘
+             └───────┐     ┌──────┘                               │                     │
+                  ┌──▼─────▼─┐           replace with       ┌─────▼─────┐         ┌─────▼─────┐
+                  │ aten.cat │          ──────────────►     │ aten.relu │   ...   │ aten.relu │
+                  └────┬─────┘                              └─────┬─────┘         └─────┬─────┘
+                       │                                          └───────┐     ┌───────┘
+                 ┌─────▼─────┐                                         ┌──▼─────▼─┐
+                 │ aten.relu │                                         │ aten.cat │
+                 └─────┬─────┘                                         └────┬─────┘
+                       │                                                    │
+    """
+
+    def __init__(self, neutron_target_spec: NeutronTargetSpec):
+        """Keep the target spec; it decides which compute nodes and activations can be fused."""
+        self.neutron_target_spec = neutron_target_spec
+
+    def call(self, module: GraphModule) -> PassResult:
+        """Move a fusable activation that follows `aten.cat` in front of the `cat`, onto each of its inputs.
+
+        :param module: graph module to transform; its graph is modified in place.
+        :return: `PassResult` holding `module` and a flag telling whether the graph was changed.
+        """
+        target_info = self.neutron_target_spec.neutron_target_info
+
+        def _is_concat(node_: Node) -> bool:
+            return (
+                node_.op == "call_function"
+                and node_.target == torch.ops.aten.cat.default
+            )
+
+        made_changes = False
+
+        for cat_node in module.graph.nodes:
+            if not _is_concat(cat_node):
+                continue  # Not cat node.
+
+            # `None` when the cat has no users; the single-user check below then skips this node.
+            activation = next(iter(cat_node.users), None)
+
+            # Check if all cat inputs nodes are conv 2D or linear 2D type and their only user is cat.
+            if not all(
+                target_info.is_fusable_conv_or_linear__aten(input_node)
+                and len(input_node.users) == 1
+                for input_node in cat_node.all_input_nodes
+            ):
+                continue
+
+            # Check if following activation is supported on Neutron as fused activation.
+            if not (
+                len(cat_node.users) == 1
+                and target_info.is_supported_fused_activation__aten(activation)
+            ):
+                continue
+
+            # Loop all Cat input nodes and insert new activation after node.
+            for input_node in cat_node.all_input_nodes:
+                with module.graph.inserting_after(input_node):
+                    new_activation = module.graph.call_function(
+                        activation.target, args=(), kwargs=activation.kwargs
+                    )
+
+                    new_activation.meta["source_fn_stack"] = [
+                        (
+                            new_activation.name,
+                            activation.meta["source_fn_stack"][-1][-1],
+                        )
+                    ]
+                    new_activation.meta["val"] = input_node.meta["val"]
+
+                    # Replace the uses of the input node with the new activation node.
+                    input_node.replace_all_uses_with(new_activation)
+                    new_activation.args = (input_node, *activation.args[1:])
+
+            # Replace the uses of the activation node with the cat node, then drop the old activation.
+            activation.replace_all_uses_with(cat_node)
+            module.graph.erase_node(activation)
+            made_changes = True
+
+        return PassResult(module, made_changes)
diff --git a/backends/nxp/aten_passes/neutron_aten_pass_manager.py b/backends/nxp/aten_passes/neutron_aten_pass_manager.py
index 407ebf5da61..35205c76c68 100644
--- a/backends/nxp/aten_passes/neutron_aten_pass_manager.py
+++ b/backends/nxp/aten_passes/neutron_aten_pass_manager.py
@@ -16,6 +16,9 @@
 from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import (
     FuseLinearAndAddPass,
 )
+from executorch.backends.nxp.aten_passes.move_activation_before_concat import (
+    MoveActivationBeforeConcat,
+)
 from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import (
     RemoveNodesWithKnownOutputs,
 )
@@ -25,6 +28,7 @@
 from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import (
     SplitGRUBasedOnNumLayers,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.exir.pass_manager import PassManager
 from torch import nn
 from torch.fx.passes.infra.pass_base import PassResult
@@ -34,7 +38,9 @@
 
 class NeutronAtenPassManager(PassManager):
 
-    def __init__(self, passes: list[PassType] = None):
+    def __init__(
+        self, neutron_target_spec: NeutronTargetSpec, passes: list[PassType] = None
+    ):
         passes: list[PassType] = passes or [
             FuseBatchNormWithConvPass(),
             FuseBatchNormWithLinearPass(),
@@ -42,6 +48,7 @@ def __init__(self, passes: list[PassType] = None):
             SplitGRUBasedOnNumLayers(),
             RemoveNodesWithKnownOutputs(),
             FuseLinearAndAddPass(),
+            MoveActivationBeforeConcat(neutron_target_spec),
         ]
 
         super().__init__(passes)
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
index 6564c19d7b9..f476e16628e 100644
--- a/backends/nxp/quantizer/neutron_quantizer.py
+++ b/backends/nxp/quantizer/neutron_quantizer.py
@@ -12,6 +12,7 @@
 from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.backends.nxp.quantizer.patterns import (
     AbsPattern,
+    ActivationsConcatClusterPattern,
     AdaptiveAvgPoolPattern,
     AddmmPattern,
     AddTensorPattern,
@@ -225,13 +226,16 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec):
         self.op_to_applied_quantizer = {
             pt: False for q in self.quantizers for pt in q.pattern.partition_types()
         }
+        self.cluster_quantizers = [
+            NeutronAtenQuantizer(ActivationsConcatClusterPattern(self), static_qconfig)
+        ]
 
     def transform_for_annotation(
         self, model: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:
         model.graph.eliminate_dead_code()  # Remove dead code to simplify the graph for the passes.
 
-        model = NeutronAtenPassManager()(model).graph_module
+        model = NeutronAtenPassManager(self.neutron_target_spec)(model).graph_module
 
         model.graph.eliminate_dead_code()  # Remove dead code again, in case it was created by the passes.
 
@@ -240,6 +244,10 @@ def transform_for_annotation(
     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
         self._annotate_inputs(model)
 
+        # Annotate node clusters in model
+        for cluster_quantizer in self.cluster_quantizers:
+            cluster_quantizer.annotate(model)
+
         nodes = list(model.graph.nodes)
         for node in nodes:
             if (
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index ccd579d5c52..ee92cd42ef1 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -13,6 +13,7 @@
 from executorch.backends.nxp.quantizer.utils import get_bias_qparams
 from torch import fx
 from torch._ops import OpOverload
+from torch.fx import Node
 from torchao.quantization.pt2e import PerChannelMinMaxObserver
 from torchao.quantization.pt2e.quantizer import (
     DerivedQuantizationSpec,
@@ -20,6 +21,7 @@
     QuantizationSpec,
     SharedQuantizationSpec,
 )
+
 from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
 
 
@@ -199,7 +201,6 @@ def partition_types(self) -> list[OpOverload]:
     def get_anchors(
         self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
     ) -> PartitionAnchors:
-        # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
         addmm_node = fused_partition[0].nodes[-1]
 
         bias_qspec = DerivedQuantizationSpec(
@@ -745,3 +746,147 @@ def get_anchors(
         return get_anchors_for_fixed_quant_specs(
             fused_partition, scale=1.0 / 128.0, zero_point=0
         )
+
+
+class ActivationsConcatClusterPattern(QuantizationPattern):
+    """
+    Quantizer for activations concat cluster pattern.
+
+    The quantizer matches a pattern where concat node is preceded by activation nodes preceded by Conv 2D or Linear.
+    All activation nodes quantization parameters must be the same. Only activations, that have support for fusion
+    to preceding compute node on Neutron are allowed. This cluster is usually produced by MoveActivationBeforeConcat
+    pass. Cluster schema:
+
+            │                     │
+     ┌──────▼──────┐       ┌──────▼──────┐
+     │ aten.conv2d │  ...  │ aten.conv2d │
+     └──────┬──────┘       └──────┬──────┘
+            │                     │
+      ┌─────▼─────┐         ┌─────▼─────┐
+      │ aten.relu │   ...   │ aten.relu │
+      └─────┬─────┘         └─────┬─────┘
+            └───────┐     ┌───────┘
+                 ┌──▼─────▼─┐
+                 │ aten.cat │
+                 └────┬─────┘
+                      │
+    """
+
+    def __init__(self, neutron_quantizer):
+        """Store the owning quantizer; its `op_to_quantizer` map and target info are used by `get_anchors`."""
+        self.neutron_quantizer = neutron_quantizer
+        self.neutron_target_info = (
+            self.neutron_quantizer.neutron_target_spec.neutron_target_info
+        )
+
+    @staticmethod
+    def _all_activations_are_equal(activations: list[Node]) -> bool:
+        """Tell whether every node in `activations` applies the same activation function as the first one."""
+        first_input_node = activations[0]
+        hardtanh_t = [
+            torch.ops.aten.hardtanh.default,
+            torch.ops.aten.hardtanh_.default,
+        ]
+        relu_t = [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]
+        tanh_t = [
+            torch.ops.aten.tanh.default,
+            torch.ops.aten.tanh_.default,
+        ]
+
+        def _activations_are_equal(activation1: Node, activation2: Node) -> bool:
+            if (  # Same family incl. inplace variants; Hardtanh min/max values are not compared here
+                (activation1.target in hardtanh_t and activation2.target in hardtanh_t)
+                or (activation1.target in relu_t and activation2.target in relu_t)
+                or (activation1.target in tanh_t and activation2.target in tanh_t)
+                or (
+                    activation1.target == torch.ops.aten.sigmoid.default
+                    and activation2.target == torch.ops.aten.sigmoid.default
+                )
+            ):
+                return True
+            elif (  # Hardtanh with min_val 0 and max_val 'inf' is equal to Relu
+                activation1.target in hardtanh_t
+                and activation1.args[1:] == (0.0, float("inf"))
+                and activation2.target in relu_t
+            ) or (
+                activation1.target in relu_t
+                and activation2.target in hardtanh_t
+                and activation2.args[1:] == (0.0, float("inf"))
+            ):
+                return True
+            else:
+                return False
+
+        return all(
+            _activations_are_equal(activation, first_input_node)
+            for activation in activations
+        )
+
+    def partition_types(self) -> list[OpOverload]:
+        """Return the operators anchoring this pattern: only `aten.cat`."""
+        return [torch.ops.aten.cat.default]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+    ) -> PartitionAnchors | None:
+        """Annotate the cluster around the matched `cat` node for quantization.
+
+        :param gm: graph module containing the cluster; node annotations are written in place.
+        :param fused_partition: matched partition; the last node of its first element is the `cat` node.
+        :return: anchors for the `cat` node, or `None` (with nothing annotated) if the cluster is not matched.
+        """
+        cat_node = fused_partition[0].nodes[-1]
+        activations = cat_node.all_input_nodes
+        compute_nodes = [n for act in activations for n in act.all_input_nodes]
+        op_to_quantizer = self.neutron_quantizer.op_to_quantizer
+
+        # Check all cat inputs are supported activations
+        if not all(
+            self.neutron_target_info.is_supported_fused_activation__aten(activation)
+            for activation in activations
+        ):
+            return None
+
+        # Check all cat inputs are equal activations
+        if not self._all_activations_are_equal(activations):
+            return None
+
+        # Check compute nodes are Conv 2D or Linear
+        if not all(
+            self.neutron_target_info.is_fusable_conv_or_linear__aten(compute_node)
+            for compute_node in compute_nodes
+        ):
+            return None
+
+        # Check every node of the cluster has a quantizer *before* annotating anything, so that bailing out
+        #  never leaves the graph partially annotated.
+        if not all(n.target in op_to_quantizer for n in compute_nodes + activations):
+            return None
+
+        # Annotate compute nodes, then clear their output spec (the following activation carries the output).
+        for compute_node in compute_nodes:
+            op_to_quantizer[compute_node.target].annotate(gm)
+            compute_node.meta["quantization_annotation"].output_qspec = None
+
+        # Annotate activations, then clear their input specs (inputs come straight from the compute nodes).
+        for activation in activations:
+            op_to_quantizer[activation.target].annotate(gm)
+            activation.meta["quantization_annotation"].input_qspec_map = {}
+
+        # Annotate cat node: all inputs and the output share the quantization of the first activation.
+        first_activation = activations[0]
+        inputs = [
+            (cat_node, NodeArgsIdx(0, idx), SharedQuantizationSpec(first_activation))
+            for idx, _ in enumerate(activations)
+        ]
+        outputs = [(cat_node, SharedQuantizationSpec(first_activation))]
+
+        return PartitionAnchors(
+            inputs=inputs,
+            weights=[],
+            biases=[],
+            output=outputs,
+        )
diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py
index fce11ce5aa2..eeb4b03d7a6 100644
--- a/backends/nxp/tests/test_batch_norm_fusion.py
+++ b/backends/nxp/tests/test_batch_norm_fusion.py
@@ -18,7 +18,10 @@
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.view_copy_converter import (
     ViewCopyConverter,
 )
-from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.executorch_pipeline import (
+    neutron_target_spec,
+    to_quantized_edge_program,
+)
 from executorch.backends.nxp.tests.executors import OverrideTargetSupportCheck
 from torch import nn
 
@@ -98,7 +101,7 @@ def test_batch_norm_conv_fusing(bias: bool, input_shape: list[int]):
     program = torch.export.export(module, example_input, strict=True)
     og_module = program.module()
 
-    pm = NeutronAtenPassManager()
+    pm = NeutronAtenPassManager(neutron_target_spec)
     graph_module_out = pm(deepcopy(program.module())).graph_module
 
     # Make sure the fusion worked.
@@ -133,7 +136,7 @@ def test_batch_norm_linear_fusing(bias: bool):
     program = torch.export.export(module, example_input, strict=True)
     og_module = program.module()
 
-    pm = NeutronAtenPassManager()
+    pm = NeutronAtenPassManager(neutron_target_spec)
     graph_module_out = pm(deepcopy(program.module())).graph_module
 
     # Make sure the fusion worked.
diff --git a/backends/nxp/tests/test_gru_splitting.py b/backends/nxp/tests/test_gru_splitting.py
index a2e9d324f69..297f9677fb2 100644
--- a/backends/nxp/tests/test_gru_splitting.py
+++ b/backends/nxp/tests/test_gru_splitting.py
@@ -13,6 +13,7 @@
 from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import (
     SplitGRUBasedOnNumLayers,
 )
+from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec
 
 
 @pytest.fixture(autouse=True)
@@ -94,7 +95,9 @@ def test_gru_splitting__with_bias(num_layers):
     )  # Just 1 `GRU` in the model.
 
     # Run pre-processing passes of the float32 aten dialect program.
-    pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])
+    pytorch_pass_manager = NeutronAtenPassManager(
+        neutron_target_spec, [SplitGRUBasedOnNumLayers()]
+    )
     pytorch_pass_manager(exir_program_aten)
 
     post_pass_output = [t.detach() for t in exir_program_aten(*example_input)]
@@ -143,7 +146,9 @@ def test_gru_splitting__no_bias(num_layers):
     )  # Just 1 `GRU` in the model.
 
     # Run pre-processing passes of the float32 aten dialect program.
-    pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])
+    pytorch_pass_manager = NeutronAtenPassManager(
+        neutron_target_spec, [SplitGRUBasedOnNumLayers()]
+    )
     pytorch_pass_manager(exir_program_aten)
 
     post_pass_output = [t.detach() for t in exir_program_aten(*example_input)]
@@ -193,7 +198,9 @@ def test_gru_splitting__bidirectional__no_bias(num_layers):
     )  # Just 1 `GRU` in the model.
 
     # Run pre-processing passes of the float32 aten dialect program.
-    pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])
+    pytorch_pass_manager = NeutronAtenPassManager(
+        neutron_target_spec, [SplitGRUBasedOnNumLayers()]
+    )
     pytorch_pass_manager(exir_program_aten)
 
     nodes = list(exir_program_aten.graph.nodes)
@@ -239,7 +246,9 @@ def test_gru_splitting__bidirectional__with_bias(num_layers):
     )  # Just 1 `GRU` in the model.
 
     # Run pre-processing passes of the float32 aten dialect program.
-    pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])
+    pytorch_pass_manager = NeutronAtenPassManager(
+        neutron_target_spec, [SplitGRUBasedOnNumLayers()]
+    )
     pytorch_pass_manager(exir_program_aten)
 
     nodes = list(exir_program_aten.graph.nodes)
diff --git a/backends/nxp/tests/test_linear_and_add_fusion.py b/backends/nxp/tests/test_linear_and_add_fusion.py
index 16d3c4140a2..222d748001c 100644
--- a/backends/nxp/tests/test_linear_and_add_fusion.py
+++ b/backends/nxp/tests/test_linear_and_add_fusion.py
@@ -18,6 +18,7 @@
 from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import (
     RemoveNodesWithKnownOutputs,
 )
+from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from parameterized import parameterized
 
@@ -121,10 +122,11 @@ def test_linear_add_fusing__static__no_bias__valid_shape(
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -167,10 +169,11 @@ def test_linear_add_fusing__static__no_bias__invalid_shape(
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -209,10 +212,11 @@ def test_linear_add_fusing__static__bias__valid_shape(
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -253,10 +257,11 @@ def test_linear_add_fusing__static__no_bias__reverse_order(self):
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -295,10 +300,11 @@ def test_linear_add_fusing__static__bias__reverse_order(self):
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -340,10 +346,11 @@ def test_linear_add_fusing__static__alpha__no_bias(self):
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -381,10 +388,11 @@ def test_linear_add_fusing__static__alpha__bias(self):
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -424,10 +432,11 @@ def test_linear_add_fusing__static__alpha__reversed_add_inputs(self):
         original_module = program.module()
 
         modified_module = NeutronAtenPassManager(
+            neutron_target_spec,
             [
                 RemoveNodesWithKnownOutputs(),  # Make the added tensor static.
                 FuseLinearAndAddPass(),
-            ]
+            ],
         )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
@@ -474,9 +483,9 @@ def test_linear_add_fusing__dynamic__no_bias__valid_shape(
         program = torch.export.export(module, example_input, strict=True)
         original_module = program.module()
 
-        modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])(
-            deepcopy(program.module())
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [FuseLinearAndAddPass()]
+        )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
         original_nodes = list(original_module.graph.nodes)
@@ -513,9 +522,9 @@ def test_linear_add_fusing__dynamic__no_bias__invalid_shape(
         program = torch.export.export(module, example_input, strict=True)
         original_module = program.module()
 
-        modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])(
-            deepcopy(program.module())
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [FuseLinearAndAddPass()]
+        )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
         original_nodes = list(original_module.graph.nodes)
@@ -550,9 +559,9 @@ def test_linear_add_fusing__dynamic__bias__valid_shape(
         program = torch.export.export(module, example_input, strict=True)
         original_module = program.module()
 
-        modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])(
-            deepcopy(program.module())
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [FuseLinearAndAddPass()]
+        )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
         original_nodes = list(original_module.graph.nodes)
@@ -584,9 +593,9 @@ def test_linear_add_fusing__dynamic__reverse_order(self):
         program = torch.export.export(module, example_input, strict=True)
         original_module = program.module()
 
-        modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])(
-            deepcopy(program.module())
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [FuseLinearAndAddPass()]
+        )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
         original_nodes = list(original_module.graph.nodes)
@@ -618,9 +627,9 @@ def test_linear_add_fusing__dynamic__alpha(self):
         program = torch.export.export(module, example_input, strict=True)
         original_module = program.module()
 
-        modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])(
-            deepcopy(program.module())
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [FuseLinearAndAddPass()]
+        )(deepcopy(program.module())).graph_module
 
         # Make sure the module wasn't broken.
         original_nodes = list(original_module.graph.nodes)
diff --git a/backends/nxp/tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/test_move_activation_before_concatenation.py
new file mode 100644
index 00000000000..779c958c049
--- /dev/null
+++ b/backends/nxp/tests/test_move_activation_before_concatenation.py
@@ -0,0 +1,947 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import unittest
+
+import kgb
+import numpy as np
+import torch
+from executorch.backends.nxp.aten_passes.move_activation_before_concat import (
+    MoveActivationBeforeConcat,
+)
+from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import (
+    NeutronAtenPassManager,
+)
+from executorch.backends.nxp.backend.edge_program_converter import (
+    EdgeProgramToIRConverter,
+)
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
+from executorch.backends.nxp.tests.executorch_pipeline import (
+    _quantize_model,
+    get_random_calibration_inputs,
+    neutron_target_spec,
+    to_model_input_spec,
+    to_quantized_edge_program,
+)
+from executorch.backends.nxp.tests.executors import (
+    convert_run_compare,
+    graph_contains_any_of_ops,
+    ToChannelFirstPreprocess,
+    ToChannelLastPreprocess,
+)
+from executorch.backends.nxp.tests.models import get_activation
+from executorch.exir.dialects._ops import ops as exir_ops
+from parameterized import parameterized
+from torch import nn
+from torch.export import ExportedProgram
+from torch.fx import GraphModule
+
+concat_cluster_ops = [  # Edge dialect ops which the tests expect to be fully delegated.
+    exir_ops.edge.aten.addmm.default,
+    exir_ops.edge.aten.convolution.default,
+    exir_ops.edge.aten.hardtanh.default,
+    exir_ops.edge.aten.relu.default,
+    exir_ops.edge.aten.sigmoid.default,
+    exir_ops.edge.aten.tanh.default,
+    exir_ops.edge.aten.cat.default,
+]
+
+
+class ConvConcatActivationModule(torch.nn.Module):
+    """Conv -> Concat -> Activation: the activation is applied after the `cat`.
+
+    The same 3x3 convolution (`in_channels` -> `in_channels`) is applied to `x` twice.
+    """
+
+    def __init__(self, activation: str, inplace: bool, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=(3, 3), padding=1)
+        self.activation = get_activation(activation, inplace)
+        self.eval()
+
+    def forward(self, x):
+        # Both `cat` inputs come from the same convolution applied to the same tensor.
+        branch_a = self.conv(x)
+        branch_b = self.conv(x)
+        concatenated = torch.cat((branch_a, branch_b), dim=1)
+        return self.activation(concatenated)
+
+
+class LinearConcatActivationModule(nn.Module):
+    """Linear/addmm/mm (selected by `mode`) -> Concat -> Activation test model."""
+
+    def __init__(
+        self, activation: str, inplace: bool, in_channels: int, mode: str = "linear"
+    ):
+        super().__init__()
+        self.mode = mode.lower()
+        valid_modes = ("linear", "addmm", "mm")
+        assert self.mode in valid_modes, "Mode must be 'linear', 'addmm', or 'mm'"
+
+        if self.mode == "linear":
+            self.linear = nn.Linear(in_channels, in_channels)
+        else:
+            # Manual weight and bias for addmm/mm, initialized like `nn.Linear` does.
+            self.weight = nn.Parameter(torch.empty(in_channels, in_channels))
+            self.bias = nn.Parameter(torch.empty(in_channels))
+            nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            nn.init.uniform_(self.bias, -bound, bound)
+
+        self.activation = get_activation(activation, inplace)
+        self.eval()
+
+    def forward(self, x):
+        x1, x2 = None, None
+
+        # The modes are mutually exclusive: exactly one branch sets `x1` and `x2`.
+        if self.mode == "linear":
+            x1 = self.linear(x)
+            x2 = self.linear(x)
+        elif self.mode == "addmm":
+            x1 = torch.addmm(self.bias, x, self.weight)
+            x2 = torch.addmm(self.bias, x, self.weight)
+        elif self.mode == "mm":
+            x1 = torch.mm(x, self.weight)
+            x2 = torch.mm(x, self.weight)
+
+        x = torch.cat((x1, x2), dim=1)
+        return self.activation(x)
+
+
+class ConvActivationConcatModule(torch.nn.Module):
+    """Conv -> Activation -> Concat: each `cat` input already has its own activation.
+
+    One 3x3 convolution is shared by both branches; only the activations may differ.
+    """
+
+    def __init__(
+        self,
+        activation1: str,
+        activation2: str,
+        act1_inplace: bool,
+        act2_inplace: bool,
+        in_channels: int,
+    ):
+        super().__init__()
+        # The convolution keeps the channel count (`in_channels` -> `in_channels`).
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=(3, 3), padding=1)
+
+        self.activation1 = get_activation(activation1, act1_inplace)
+        self.activation2 = get_activation(activation2, act2_inplace)
+        self.eval()
+
+    def forward(self, x):
+        branch_a = self.activation1(self.conv(x))
+        branch_b = self.activation2(self.conv(x))
+
+        return torch.cat((branch_a, branch_b), dim=1)
+
+
+class LinearActivationConcatModule(torch.nn.Module):
+    """Linear -> Activation -> Concat: both branches share one `nn.Linear`."""
+
+    def __init__(
+        self,
+        activation1: str,
+        activation2: str,
+        act1_inplace: bool,
+        act2_inplace: bool,
+        in_channels: int,
+    ):
+        super().__init__()
+        self.linear = nn.Linear(in_channels, in_channels)
+        self.activation1 = get_activation(activation1, act1_inplace)
+        self.activation2 = get_activation(activation2, act2_inplace)
+        self.eval()
+
+    def forward(self, x):
+        # Each branch applies the shared linear layer, then its own activation.
+        branch_a = self.activation1(self.linear(x))
+        branch_b = self.activation2(self.linear(x))
+        return torch.cat((branch_a, branch_b), dim=1)
+
+
+class TestMoveActivationBeforeConcat(unittest.TestCase):
+    __test__ = False  # Prevent interfering with PyTest tests.
+
+    @classmethod
+    def setUpClass(cls):
+        np.random.seed(42)  # Seed NumPy's global RNG.
+        torch.manual_seed(23)  # Seed PyTorch's global RNG.
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat__conv(self, activation, inplace):
+        input_shape = (1, 3, 8, 8)
+        model = ConvConcatActivationModule(
+            activation=activation, inplace=inplace, in_channels=3
+        )
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+        example_input = calibration_inputs[0]
+
+        exir_program_aten = torch.export.export(
+            model, example_input, strict=True
+        ).module()
+
+        outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+        nodes = list(exir_program_aten.graph.nodes)
+        assert len(nodes) == 8
+        cat_node = nodes[5]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[6]
+            )
+        )
+
+        # Apply only `MoveActivationBeforeConcat`; it modifies the module in place.
+        NeutronAtenPassManager(
+            neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)]
+        )(exir_program_aten)
+
+        nodes = list(exir_program_aten.graph.nodes)
+
+        # The optimization was applied if every `cat` input is now an activation.
+        assert len(nodes) == 9
+        cat_node = nodes[7]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert nodes[8].target == "output"
+
+        outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+
+        # The output must be unchanged (up to the `np.allclose` tolerances).
+        assert np.allclose(outputs_before[0], outputs_after[0])
+
+        # Run pre-processing passes of the float32 aten dialect program.
+        neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec)
+        neutron_aten_pass_manager(exir_program_aten)  # All passes by default.
+
+        exir_program_aten_quant = _quantize_model(
+            exir_program_aten,
+            NeutronQuantizer(neutron_target_spec),
+            calibration_inputs,
+        )
+
+        # Check that each convolution and its activation share one QDQ cluster.
+        nodes = list(exir_program_aten_quant.graph.nodes)
+        assert len(nodes) == 26
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[14]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[15]
+            )
+        )
+        assert (
+            nodes[16].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[18]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[19]
+            )
+        )
+        assert (
+            nodes[20].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat__linear(self, activation, inplace):
+        input_shape = (1, 8)
+        model = LinearConcatActivationModule(
+            activation=activation, inplace=inplace, in_channels=8, mode="linear"
+        )
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+        example_input = calibration_inputs[0]
+
+        exir_program_aten = torch.export.export(
+            model, example_input, strict=True
+        ).module()
+
+        outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+        nodes = list(exir_program_aten.graph.nodes)
+        assert len(nodes) == 8
+        cat_node = nodes[5]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[6]
+            )
+        )
+
+        # Apply only `MoveActivationBeforeConcat`; it modifies the module in place.
+        NeutronAtenPassManager(
+            neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)]
+        )(exir_program_aten)
+
+        nodes = list(exir_program_aten.graph.nodes)
+
+        # The optimization was applied if every `cat` input is now an activation.
+        assert len(nodes) == 9
+        cat_node = nodes[7]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert nodes[8].target == "output"
+
+        outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+
+        # The output must be unchanged (up to the `np.allclose` tolerances).
+        assert np.allclose(outputs_before[0], outputs_after[0])
+
+        # Run pre-processing passes of the float32 aten dialect program.
+        neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec)
+        neutron_aten_pass_manager(exir_program_aten)  # All passes by default.
+
+        exir_program_aten_quant = _quantize_model(
+            exir_program_aten,
+            NeutronQuantizer(neutron_target_spec),
+            calibration_inputs,
+        )
+
+        # Check that each linear and its activation share one QDQ cluster.
+        nodes = list(exir_program_aten_quant.graph.nodes)
+        assert len(nodes) == 22
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[10]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[11]
+            )
+        )
+        assert (
+            nodes[12].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[14]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[15]
+            )
+        )
+        assert (
+            nodes[16].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat__addmm(self, activation, inplace):
+        input_shape = (1, 8)
+        model = LinearConcatActivationModule(
+            activation=activation, inplace=inplace, in_channels=8, mode="addmm"
+        )
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+        example_input = calibration_inputs[0]
+
+        exir_program_aten = torch.export.export(
+            model, example_input, strict=True
+        ).module()
+
+        outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+        nodes = list(exir_program_aten.graph.nodes)
+        assert len(nodes) == 8
+        cat_node = nodes[5]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[6]
+            )
+        )
+
+        # Apply only `MoveActivationBeforeConcat`; it modifies the module in place.
+        NeutronAtenPassManager(
+            neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)]
+        )(exir_program_aten)
+
+        nodes = list(exir_program_aten.graph.nodes)
+
+        # The optimization was applied if every `cat` input is now an activation.
+        assert len(nodes) == 9
+        cat_node = nodes[7]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert nodes[8].target == "output"
+
+        outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+
+        # The output must be unchanged (up to the `np.allclose` tolerances).
+        assert np.allclose(outputs_before[0], outputs_after[0])
+
+        # Run pre-processing passes of the float32 aten dialect program.
+        neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec)
+        neutron_aten_pass_manager(exir_program_aten)  # All passes by default.
+
+        exir_program_aten_quant = _quantize_model(
+            exir_program_aten,
+            NeutronQuantizer(neutron_target_spec),
+            calibration_inputs,
+        )
+
+        # Check that each addmm and its activation share one QDQ cluster.
+        nodes = list(exir_program_aten_quant.graph.nodes)
+        assert len(nodes) == 22
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[10]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[11]
+            )
+        )
+        assert (
+            nodes[12].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[14]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[15]
+            )
+        )
+        assert (
+            nodes[16].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat__mm(self, activation, inplace):
+        input_shape = (1, 8)
+        model = LinearConcatActivationModule(
+            activation=activation, inplace=inplace, in_channels=8, mode="mm"
+        )
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+        example_input = calibration_inputs[0]
+
+        exir_program_aten = torch.export.export(
+            model, example_input, strict=True
+        ).module()
+
+        outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+        nodes = list(exir_program_aten.graph.nodes)
+        assert len(nodes) == 7
+        cat_node = nodes[4]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[5]
+            )
+        )
+
+        # Apply only `MoveActivationBeforeConcat`; it modifies the module in place.
+        NeutronAtenPassManager(
+            neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)]
+        )(exir_program_aten)
+
+        nodes = list(exir_program_aten.graph.nodes)
+
+        # The optimization was applied if every `cat` input is now an activation.
+        assert len(nodes) == 8
+        cat_node = nodes[6]
+        assert cat_node.target == torch.ops.aten.cat.default
+        assert all(
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                input_node
+            )
+            and len(input_node.users) == 1
+            for input_node in cat_node.all_input_nodes
+        )
+        assert nodes[7].target == "output"
+
+        outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)]
+
+        # The output must be unchanged (up to the `np.allclose` tolerances).
+        assert np.allclose(outputs_before[0], outputs_after[0])
+
+        # Run pre-processing passes of the float32 aten dialect program.
+        neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec)
+        neutron_aten_pass_manager(exir_program_aten)  # All passes by default.
+
+        exir_program_aten_quant = _quantize_model(
+            exir_program_aten,
+            NeutronQuantizer(neutron_target_spec),
+            calibration_inputs,
+        )
+
+        # Check that each mm and its activation share one QDQ cluster.
+        nodes = list(exir_program_aten_quant.graph.nodes)
+        assert len(nodes) == 19
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[7]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[8]
+            )
+        )
+        assert (
+            nodes[9].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+        assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+            nodes[11]
+        )
+        assert (
+            neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                nodes[12]
+            )
+        )
+        assert (
+            nodes[13].target
+            == torch.ops.quantized_decomposed.quantize_per_tensor.default
+        )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat_quantization__conv(
+        self, activation, inplace
+    ):
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program,
+            call_original=True,
+            owner=EdgeProgramToIRConverter,
+        ) as converter_spy:
+            input_shape = (1, 8, 8, 8)
+            model = ConvConcatActivationModule(
+                activation=activation, inplace=inplace, in_channels=8
+            )
+
+            edge_program = to_quantized_edge_program(
+                model, input_shape
+            ).exported_program()
+
+            # Everything must be delegated: no concat cluster op may remain.
+            assert not graph_contains_any_of_ops(
+                graph=edge_program.graph, ops=concat_cluster_ops
+            )
+            assert any(
+                "lowered_module" in node.name for node in edge_program.graph.nodes
+            )
+
+            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+            input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+                np.int8
+            )
+            convert_run_compare(
+                exported_program,
+                input_data,
+                tfl_model=tflite_flatbuffers_model,
+                tflite_input_preprocess=ToChannelLastPreprocess(),
+                tflite_output_preprocess=ToChannelFirstPreprocess(),
+            )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat_quantization__linear(
+        self, activation, inplace
+    ):
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program,
+            call_original=True,
+            owner=EdgeProgramToIRConverter,
+        ) as converter_spy:
+            input_shape = (1, 8)
+            model = LinearConcatActivationModule(
+                activation=activation, inplace=inplace, in_channels=8, mode="linear"
+            )
+
+            edge_program = to_quantized_edge_program(
+                model, input_shape
+            ).exported_program()
+
+            # Everything must be delegated: no concat cluster op may remain.
+            assert not graph_contains_any_of_ops(
+                graph=edge_program.graph, ops=concat_cluster_ops
+            )
+            assert any(
+                "lowered_module" in node.name for node in edge_program.graph.nodes
+            )
+
+            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+            input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+                np.int8
+            )
+            convert_run_compare(
+                exported_program,
+                input_data,
+                tfl_model=tflite_flatbuffers_model,
+            )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat_quantization__addmm(
+        self, activation, inplace
+    ):
+        torch.manual_seed(23)  # Reproducible random weight/bias init.
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program,
+            call_original=True,
+            owner=EdgeProgramToIRConverter,
+        ) as converter_spy:
+            input_shape = (1, 8)
+            model = LinearConcatActivationModule(
+                activation=activation, inplace=inplace, in_channels=8, mode="addmm"
+            )
+
+            edge_program = to_quantized_edge_program(
+                model, input_shape
+            ).exported_program()
+
+            # Everything must be delegated: no concat cluster op may remain.
+            assert not graph_contains_any_of_ops(
+                graph=edge_program.graph, ops=concat_cluster_ops
+            )
+            assert any(
+                "lowered_module" in node.name for node in edge_program.graph.nodes
+            )
+
+            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+            input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+                np.int8
+            )
+            convert_run_compare(
+                exported_program,
+                input_data,
+                tfl_model=tflite_flatbuffers_model,
+                atol=1.0,
+            )
+
+    @parameterized.expand(
+        [
+            ["relu", True],
+            ["relu", False],
+            ["relu6", True],
+            ["relu6", False],
+            ["tanh", True],
+            ["tanh", False],
+            ["sigmoid", False],
+        ]
+    )
+    def test_move_activation_before_concat_quantization__mm(self, activation, inplace):
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program,
+            call_original=True,
+            owner=EdgeProgramToIRConverter,
+        ) as converter_spy:
+            input_shape = (1, 8)
+            model = LinearConcatActivationModule(
+                activation=activation, inplace=inplace, in_channels=8, mode="mm"
+            )
+
+            edge_program = to_quantized_edge_program(
+                model, input_shape
+            ).exported_program()
+
+            # Everything must be delegated: no concat cluster op may remain.
+            assert not graph_contains_any_of_ops(
+                graph=edge_program.graph, ops=concat_cluster_ops
+            )
+            assert any(
+                "lowered_module" in node.name for node in edge_program.graph.nodes
+            )
+
+            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+            input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+                np.int8
+            )
+            convert_run_compare(
+                exported_program,
+                input_data,
+                tfl_model=tflite_flatbuffers_model,
+            )
+
+    @parameterized.expand(
+        [
+            ["relu", "relu", True, False],
+            ["relu6", "relu6", False, True],
+            ["tanh", "tanh", True, False],
+            ["sigmoid", "sigmoid", False, True],
+            ["relu", "relu_hardtanh", True, True],
+        ]
+    )
+    def test_concat_cluster_quantization__conv(
+        self, activation1, activation2, act1_inplace, act2_inplace
+    ):
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program,
+            call_original=True,
+            owner=EdgeProgramToIRConverter,
+        ) as converter_spy:
+            with kgb.spy_on(_quantize_model, call_original=True) as quantizer_spy:
+                input_shape = (1, 8, 8, 8)
+                model = ConvActivationConcatModule(
+                    activation1, activation2, act1_inplace, act2_inplace, in_channels=8
+                )
+
+                edge_program = to_quantized_edge_program(
+                    model, input_shape
+                ).exported_program()
+
+                # Make sure that all nodes were delegated.
+                assert not graph_contains_any_of_ops(
+                    graph=edge_program.graph,
+                    ops=concat_cluster_ops,
+                )
+                assert any(
+                    "lowered_module" in node.name for node in edge_program.graph.nodes
+                )
+
+                tflite_flatbuffers_model, io_formats = converter_spy.calls[
+                    -1
+                ].return_value
+                exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+                exir_program_aten_quant: GraphModule = quantizer_spy.calls[-1].args[0]
+
+                # Check that convolution and activation are in the same QDQ cluster.
+                nodes = list(exir_program_aten_quant.graph.nodes)
+                assert len(nodes) == 26
+                assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                    nodes[14]
+                )
+                assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                    nodes[15]
+                )
+                assert (
+                    nodes[16].target
+                    == torch.ops.quantized_decomposed.quantize_per_tensor.default
+                )
+                assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                    nodes[18]
+                )
+                assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                    nodes[19]
+                )
+                assert (
+                    nodes[20].target
+                    == torch.ops.quantized_decomposed.quantize_per_tensor.default
+                )
+
+                input_data = (
+                    np.random.random(input_shape).astype(np.float32) * 50
+                ).astype(np.int8)
+                convert_run_compare(
+                    exported_program,
+                    input_data,
+                    tfl_model=tflite_flatbuffers_model,
+                    tflite_input_preprocess=ToChannelLastPreprocess(),
+                    tflite_output_preprocess=ToChannelFirstPreprocess(),
+                )
+
+    @parameterized.expand(
+        [
+            ["relu", "relu", True, False],
+            ["relu6", "relu6", False, True],
+            ["tanh", "tanh", True, False],
+            ["sigmoid", "sigmoid", False, True],
+            ["relu", "relu_hardtanh", True, True],
+        ]
+    )
+    def test_concat_cluster_quantization__linear(
+        self, activation1, activation2, act1_inplace, act2_inplace
+    ):
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program,
+            call_original=True,
+            owner=EdgeProgramToIRConverter,
+        ) as converter_spy:
+            with kgb.spy_on(_quantize_model, call_original=True) as quantizer_spy:
+                input_shape = (1, 8)
+                model = LinearActivationConcatModule(
+                    activation1, activation2, act1_inplace, act2_inplace, in_channels=8
+                )
+
+                edge_program = to_quantized_edge_program(
+                    model, input_shape
+                ).exported_program()
+
+                # Make sure that all nodes were delegated.
+                assert not graph_contains_any_of_ops(
+                    graph=edge_program.graph,
+                    ops=concat_cluster_ops,
+                )
+                assert any(
+                    "lowered_module" in node.name for node in edge_program.graph.nodes
+                )
+
+                tflite_flatbuffers_model, io_formats = converter_spy.calls[
+                    -1
+                ].return_value
+                exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+                exir_program_aten_quant: GraphModule = quantizer_spy.calls[-1].args[0]
+
+                # Check that linear and activation are in the same QDQ cluster.
+                nodes = list(exir_program_aten_quant.graph.nodes)
+                assert len(nodes) == 22
+                assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                    nodes[10]
+                )
+                assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                    nodes[11]
+                )
+                assert (
+                    nodes[12].target
+                    == torch.ops.quantized_decomposed.quantize_per_tensor.default
+                )
+                assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten(
+                    nodes[14]
+                )
+                assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten(
+                    nodes[15]
+                )
+                assert (
+                    nodes[16].target
+                    == torch.ops.quantized_decomposed.quantize_per_tensor.default
+                )
+
+                input_data = (
+                    np.random.random(input_shape).astype(np.float32) * 50
+                ).astype(np.int8)
+                convert_run_compare(
+                    exported_program,
+                    input_data,
+                    tfl_model=tflite_flatbuffers_model,
+                    tflite_input_preprocess=ToChannelLastPreprocess(),
+                    tflite_output_preprocess=ToChannelFirstPreprocess(),
+                )
diff --git a/backends/nxp/tests/test_removing_nodes_with_known_outputs.py b/backends/nxp/tests/test_removing_nodes_with_known_outputs.py
index 8f5549c8526..0c496356791 100644
--- a/backends/nxp/tests/test_removing_nodes_with_known_outputs.py
+++ b/backends/nxp/tests/test_removing_nodes_with_known_outputs.py
@@ -17,6 +17,7 @@
 from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import (
     SplitGRUBasedOnNumLayers,
 )
+from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 from parameterized import parameterized
 from torch import nn
@@ -57,7 +58,9 @@ def test_removing_nodes__zeros(self):
         outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)]
 
         # Apply the optimization.
-        NeutronAtenPassManager([RemoveNodesWithKnownOutputs()])(exir_program_aten)
+        NeutronAtenPassManager(neutron_target_spec, [RemoveNodesWithKnownOutputs()])(
+            exir_program_aten
+        )
 
         # Make sure the `aten.zeros` is no longer in the model.
         assert not graph_contains_any_of_ops(
@@ -81,7 +84,9 @@ def test_removing_nodes__split(self, num_layers):
         exir_program_aten = torch.export.export(model, example_input).module()
 
         # Apply the pass to split the `aten.gru.input` into multiple instances, and add a `split` node.
-        NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])(exir_program_aten)
+        NeutronAtenPassManager(neutron_target_spec, [SplitGRUBasedOnNumLayers()])(
+            exir_program_aten
+        )
 
         # Make sure the `aten.zeros` and `torch.split` are in the model.
         assert graph_contains_any_of_ops(
@@ -93,7 +98,9 @@ def test_removing_nodes__split(self, num_layers):
         outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)]
 
         # Apply the optimization.
-        NeutronAtenPassManager([RemoveNodesWithKnownOutputs()])(exir_program_aten)
+        NeutronAtenPassManager(neutron_target_spec, [RemoveNodesWithKnownOutputs()])(
+            exir_program_aten
+        )
 
         # Make sure the `aten.zeros` and `torch.split` are no longer in the model.
         assert not graph_contains_any_of_ops(
diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py
index 8b2d5723dbb..6e084699307 100644
--- a/backends/nxp/tests/test_split_group_convolution.py
+++ b/backends/nxp/tests/test_split_group_convolution.py
@@ -88,9 +88,9 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int
         graph_module = torch.export.export(module, example_input, strict=True).module()
         original_module = deepcopy(graph_module)
 
-        modified_module = NeutronAtenPassManager([SplitGroupConvolution()])(
-            graph_module
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [SplitGroupConvolution()]
+        )(graph_module).graph_module
 
         # Make sure the fusion worked.
         original_nodes = list(original_module.graph.nodes)
@@ -145,9 +145,9 @@ def test_split_group_convolution__1d(self, _, input_shape: list[int], group: int
         graph_module = torch.export.export(module, example_input).module()
         original_module = deepcopy(graph_module)
 
-        modified_module = NeutronAtenPassManager([SplitGroupConvolution()])(
-            graph_module
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [SplitGroupConvolution()]
+        )(graph_module).graph_module
 
         # Make sure the fusion worked.
         original_nodes = list(original_module.graph.nodes)
@@ -199,9 +199,9 @@ def test_split_group_convolution__3d(self, _, input_shape: list[int], group: int
         graph_module = torch.export.export(module, example_input).module()
         original_module = deepcopy(graph_module)
 
-        modified_module = NeutronAtenPassManager([SplitGroupConvolution()])(
-            graph_module
-        ).graph_module
+        modified_module = NeutronAtenPassManager(
+            neutron_target_spec, [SplitGroupConvolution()]
+        )(graph_module).graph_module
 
         # Verify that the pass has NOT made any changes, as it is disabled for 3D convolution.
         original_nodes = list(original_module.graph.nodes)
@@ -233,7 +233,7 @@ def test_split_group_convolution__applied_by_default(self):
         graph_module = torch.export.export(module, example_input).module()
         original_module = deepcopy(graph_module)
 
-        modified_module = NeutronAtenPassManager()(
+        modified_module = NeutronAtenPassManager(neutron_target_spec)(
             graph_module
         ).graph_module  # Default passes.