diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index 55c64fef326..4d2449f946c 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -7,7 +7,6 @@ from . import arm_pass_utils # noqa from .arm_pass import ArmPass # noqa # usort: skip from .add_bias_pass import AddBiasPass # noqa -from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder # noqa from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass # noqa from .broadcast_args_pass import BroadcastArgsPass # noqa from .cast_bool_to_int8_pass import CastBoolToInt8Pass # noqa @@ -85,6 +84,7 @@ ) from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa from .size_adjust_input_pass import SizeAdjustInputPass # noqa +from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass # noqa from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass # noqa from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass # noqa from .replace_inf_values_pass import ReplaceInfValues # noqa # usort: skip diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 7aab59ac310..7592be1d7da 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -10,7 +10,6 @@ import executorch.backends.arm.tosa.dialect # noqa: unused from executorch.backends.arm._passes import ( AddBiasPass, - AnnotateChannelsLastDimOrder, AnnotateDecomposedMatmulPass, BroadcastArgsPass, CastBoolToInt8Pass, @@ -84,6 +83,7 @@ RetraceFoldedDtypesPass, ScalarsToAttributePass, SizeAdjustInputPass, + ToTosaMemoryFormatPass, UnsqueezeBeforeRepeatPass, UnsqueezeScalarPlaceholdersPass, ) @@ -162,7 +162,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) - self.add_pass(AnnotateChannelsLastDimOrder()) + 
self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(InsertRescalePass()) return self._transform(exported_program.graph_module) @@ -241,7 +241,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(AddBiasPass(exported_program)) self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) - self.add_pass(AnnotateChannelsLastDimOrder()) + self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(InsertRescalePass()) return self._transform(exported_program.graph_module) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py index 9a25b7c28ae..99c89f474ea 100644 --- a/backends/arm/_passes/decompose_select.py +++ b/backends/arm/_passes/decompose_select.py @@ -7,7 +7,10 @@ # pyre-unsafe import torch -from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_first_fake_tensor, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -34,8 +37,9 @@ def call(self, graph_module: torch.fx.GraphModule): input_node, dim, index = node.args - rank = len(input_node.meta["val"].size()) - shape = input_node.meta["val"].shape + input_tensor = get_first_fake_tensor(input_node) + rank = len(input_tensor.size()) + shape = input_tensor.shape dim = dim % rank if dim < 0 else dim index = index % shape[dim] if index < 0 else index @@ -44,7 +48,7 @@ def call(self, graph_module: torch.fx.GraphModule): graph_module.graph, slice_op, (input_node, dim, index, index + 1) ) squeeze_node = create_node( - graph_module.graph, squeeze_op, (slice_node, [dim]) + graph_module.graph, squeeze_op, (slice_node, [dim]), from_node=node ) node.replace_all_uses_with(squeeze_node) diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py 
b/backends/arm/_passes/to_tosa_memory_format_pass.py similarity index 73% rename from backends/arm/_passes/annotate_channels_last_dim_order_pass.py rename to backends/arm/_passes/to_tosa_memory_format_pass.py index 0ce8d667b3c..49482a70059 100644 --- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/_passes/to_tosa_memory_format_pass.py @@ -10,13 +10,22 @@ from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, + is_param_node, ) from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d +from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -class AnnotateChannelsLastDimOrder(ExportPass): +def _is_input(node: torch.fx.Node, exported_program: ExportedProgram) -> bool: + """ + Returns True if the node is an input node, i.e. a placeholder or a parameter. + """ + return node.op == "placeholder" and not is_param_node(exported_program, node) + + +class ToTosaMemoryFormatPass(ExportPass): """ Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. 
The pass also inserts backend.tosa.TRANSPOSE @@ -30,6 +39,10 @@ class AnnotateChannelsLastDimOrder(ExportPass): NNHWC_order = (0, 1, 3, 4, 2) NNHWC_inverse_order = (0, 1, 4, 2, 3) + def __init__(self, exported_program: ExportedProgram) -> None: + self.exported_program = exported_program + super().__init__() + def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): """ returns True for w in the following sequence; @@ -92,6 +105,11 @@ def is_channel_reshape(input_shape, output_shape): @staticmethod def insert_input_transpose(node, input_node, graph_module): + if input_node.target == exir_ops.backend.tosa.TRANSPOSE.default: + pre_permute_node = input_node.all_input_nodes[0] + node.replace_input_with(input_node, pre_permute_node) + return + with graph_module.graph.inserting_before(node): permute_node = create_node( graph_module.graph, @@ -99,18 +117,18 @@ def insert_input_transpose(node, input_node, graph_module): args=( input_node, list( - AnnotateChannelsLastDimOrder.NNHWC_inverse_order + ToTosaMemoryFormatPass.NNHWC_inverse_order if len(get_first_fake_tensor(input_node).size()) == 5 - else AnnotateChannelsLastDimOrder.NHWC_inverse_order + else ToTosaMemoryFormatPass.NHWC_inverse_order ), ), + from_node=node, ) node.replace_input_with(input_node, permute_node) permute_node.meta["tosa_dim_order"] = tuple( range(len(input_node.meta["val"].size())) ) - permute_node.meta["val"] = input_node.meta["val"] @staticmethod def insert_output_transpose(node, graph_module): @@ -121,25 +139,23 @@ def insert_output_transpose(node, graph_module): args=( node, list( - AnnotateChannelsLastDimOrder.NNHWC_order + ToTosaMemoryFormatPass.NNHWC_order if len(get_first_fake_tensor(node).size()) == 5 - else AnnotateChannelsLastDimOrder.NHWC_order + else ToTosaMemoryFormatPass.NHWC_order ), ), + from_node=node, ) + permute_node.meta["tosa_dim_order"] = ( - AnnotateChannelsLastDimOrder.NNHWC_order + ToTosaMemoryFormatPass.NNHWC_order if len(get_first_fake_tensor(node).size()) == 5 - 
else AnnotateChannelsLastDimOrder.NHWC_order - ) - permute_node.meta["val"] = get_first_fake_tensor(node).permute( - AnnotateChannelsLastDimOrder.NNHWC_order - if len(get_first_fake_tensor(node).size()) == 5 - else AnnotateChannelsLastDimOrder.NHWC_order + else ToTosaMemoryFormatPass.NHWC_order ) node.meta["tosa_dim_order"] = tuple( range(len(get_first_fake_tensor(node).size())) ) + users = [user for user in node.users if user != permute_node] for user in users: user.replace_input_with(node, permute_node) @@ -150,20 +166,23 @@ def _insert_view_transpose( ): nchw_to_nhwc = len(input_shape) < 4 and len(output_shape) >= 4 nhwc_to_nchw = len(input_shape) >= 4 and len(output_shape) < 4 - channel_reshape = AnnotateChannelsLastDimOrder.is_channel_reshape( + channel_reshape = ToTosaMemoryFormatPass.is_channel_reshape( output_shape, input_shape ) if ( channel_reshape or nhwc_to_nchw - ) and AnnotateChannelsLastDimOrder.memory_format_differs(input_shape): - AnnotateChannelsLastDimOrder.insert_input_transpose( + ) and ToTosaMemoryFormatPass.memory_format_differs(input_shape): + + ToTosaMemoryFormatPass.insert_input_transpose( node, input_node, graph_module ) + if ( channel_reshape or nchw_to_nhwc - ) and AnnotateChannelsLastDimOrder.memory_format_differs(output_shape): - AnnotateChannelsLastDimOrder.insert_output_transpose(node, graph_module) + ) and ToTosaMemoryFormatPass.memory_format_differs(output_shape): + + ToTosaMemoryFormatPass.insert_output_transpose(node, graph_module) def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): """ @@ -181,9 +200,10 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: # call_function and placeholder allowed due to # index.Tensor being able to come in as both - if node.op not in ["call_function", "placeholder"]: + if node.op not in ["call_function", "placeholder", "output"]: continue + # Transpose views elif node.target in ( exir_ops.edge.aten.view_copy.default, 
exir_ops.edge.aten.index.Tensor, @@ -194,25 +214,48 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): input_node = node.args[0] input_shape = input_node.meta["val"].shape output_shape = node.meta["val"].shape - self._insert_view_transpose( - input_shape, output_shape, node, input_node, graph_module + input_shape, + output_shape, + node, + input_node, + graph_module, ) + # Transpose inputs + elif _is_input(node, self.exported_program): + input_shape = get_first_fake_tensor(node).size() + if len(input_shape) in (4, 5): + ToTosaMemoryFormatPass.insert_output_transpose(node, graph_module) + + # Transpose outputs + elif node.op == "output": + output_shape = get_first_fake_tensor(node).size() + + if len(output_shape) in (4, 5): + for input_node in node.all_input_nodes: + ToTosaMemoryFormatPass.insert_input_transpose( + node, input_node, graph_module + ) + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: node_data = get_first_fake_tensor(node).data - if node_data.dim() == 4: + # Inputs and outputs are always in (N)NCHW format + if _is_input(node, self.exported_program) or node.op == "output": + dim_order = tuple(range(node_data.dim())) + elif node_data.dim() == 4: dim_order = self.NHWC_order if self.is_weight_node_for_depthwise_conv2d(node): # The weights of TOSA DEPTHWISE_CONV2D have shape (H, W, C, M) which corresponds to # dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d). dim_order = self.HWCM_order elif node_data.dim() == 5: - dim_order = self.NNHWC_order # type: ignore[assignment] + dim_order = self.NNHWC_order else: dim_order = tuple(range(node_data.dim())) # type: ignore[assignment] + node.meta["tosa_dim_order"] = dim_order # Insert TOSA transposes to convert between (N)NCHW and (N)NHWC format. # See insert_tosa_transposes for insertion conditions. 
diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_transpose.py index 91614874d23..accd79e8546 100644 --- a/backends/arm/operators/op_transpose.py +++ b/backends/arm/operators/op_transpose.py @@ -47,7 +47,14 @@ def define_node( validate_valid_dtype( self.target, [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ + ts.DType.INT8, + ts.DType.INT16, + ts.DType.INT32, + ts.DType.FP32, + ts.DType.BOOL, + ts.DType.FP16, + ], output.tosa_spec, ) diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index c91ad4021c4..bff5ff69284 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -261,9 +261,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { // Select a compatible copy routine including checking for input layouts // which require permutation. - bool permuted_input_shape; - ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, tensor_in, &handles.inputs->io[i], &permuted_input_shape)); bool both_int = tensor_in.scalar_type() == ScalarType::Int && handles.inputs->io[i].elem_size == 4; bool both_char = tensor_in.scalar_type() == ScalarType::Char && @@ -273,19 +270,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { bool both_bool = tensor_in.scalar_type() == ScalarType::Bool && (handles.inputs->io[i].elem_size == 1); - // Select a compatible copy routine - if ((both_char || both_bool) && permuted_input_shape) { - EXECUTORCH_PROF_SCOPE( - event_tracer, - "+EthosUBackend::execute()handles.input.permute_CHW_to_HWC()"); - // permuted byte copy CHW to HWC - permute_CHW_to_HWC( - tensor_in.mutable_data_ptr(), - scratch_addr, - tensor_in.size(1), - tensor_in.size(2), - tensor_in.size(3)); - } else if (both_char || both_int || both_short || both_bool) { + if (both_char || both_int || both_short || both_bool) { EXECUTORCH_PROF_SCOPE( event_tracer, 
"+EthosUBackend::execute()handles.input.memcpy()"); // Sizes match and elt size matches so memcpy @@ -297,18 +282,16 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Error, "No matching input copy routine"); return Error::InvalidProgram; } - if (!permuted_input_shape) { - calculate_dimensions( - tensor_in, &handles.inputs->io[i], &tensor_count, &io_count); - if (tensor_count != io_count) { - ET_LOG(Error, "Input tensor sizes do not match"); - ET_LOG( - Error, - "Program expects %d elements but got %d", - io_count, - tensor_count); - return Error::InvalidProgram; - } + calculate_dimensions( + tensor_in, &handles.inputs->io[i], &tensor_count, &io_count); + if (tensor_count != io_count) { + ET_LOG(Error, "Input tensor sizes do not match"); + ET_LOG( + Error, + "Program expects %d elements but got %d", + io_count, + tensor_count); + return Error::InvalidProgram; } } @@ -369,34 +352,13 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { tensor_dim = tensor_dim + tensor_count; io_dim = io_dim + io_count; - bool permuted_output_shape; - ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, tensor_out, &handles.outputs->io[i], &permuted_output_shape)); + EXECUTORCH_PROF_SCOPE( + event_tracer, "+EthosUBackend::execute()handles.output.memcpy()"); - if ((tensor_out.scalar_type() == ScalarType::Char || - tensor_out.scalar_type() == ScalarType::Bool) && - permuted_output_shape) { - EXECUTORCH_PROF_SCOPE( - event_tracer, - "+EthosUBackend::execute()handles.output.permute_HWC_to_CHW()"); - - const char* output_address = static_cast(output_addr); - - permute_HWC_to_CHW( - output_address, - tensor_out.mutable_data_ptr(), - tensor_out.size(1), - tensor_out.size(2), - tensor_out.size(3)); - } else { - EXECUTORCH_PROF_SCOPE( - event_tracer, "+EthosUBackend::execute()handles.output.memcpy()"); - - memcpy( - tensor_out.mutable_data_ptr(), - static_cast(output_addr), - tensor_out.nbytes()); - } + memcpy( + 
tensor_out.mutable_data_ptr(), + static_cast(output_addr), + tensor_out.nbytes()); } if (tensor_dim != io_dim) { ET_LOG(Error, "Total output tensor sizes do not match"); @@ -426,46 +388,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { *io_count = *io_count * io->shape[i]; } } - - Error check_requires_permute( - int index, - const executorch::aten::Tensor tensor, - VelaIO* io, - bool* is_permuted) const { - bool permuted_shape = false; - - if (tensor.dim() == 4) { - // special case for NHWC workaround in AOT; as the compilation has - // permuted to channel last in an undetectable way, we assume here - // that the application has similarly permuted any input/output tensors. - permuted_shape = tensor.size(0) == io->shape[0] && - tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] && - tensor.size(3) == io->shape[2]; - if (permuted_shape) { - ET_LOG(Debug, "Tensor input/output %d will be permuted", index); - } - } - *is_permuted = permuted_shape; - return Error::Ok; - } - - void permute_CHW_to_HWC(const char* input, char* output, int C, int H, int W) - const { - for (int i = 0; i != H * W; ++i) { - for (int j = 0; j < C; ++j) { - output[i * C + j] = input[i + j * W * H]; - } - } - } - - void permute_HWC_to_CHW(const char* input, char* output, int C, int H, int W) - const { - for (int i = 0; i != H * W; ++i) { - for (int j = 0; j < C; ++j) { - output[i + j * W * H] = input[i * C + j]; - } - } - } }; namespace { @@ -476,4 +398,4 @@ static auto registered = register_backend(backend_id); } // namespace arm } // namespace backends -} // namespace executorch +} // namespace executorch \ No newline at end of file diff --git a/backends/arm/test/models/test_nn_functional.py b/backends/arm/test/models/test_nn_functional.py index 651f9585459..c1a9f312d85 100644 --- a/backends/arm/test/models/test_nn_functional.py +++ b/backends/arm/test/models/test_nn_functional.py @@ -83,6 +83,8 @@ def forward(self, *args): module_tests, xfails={ 
"affine_grid": "Int64 input. Partition handling fails since arange int64 output is split between 2 partitions.", + "unfold": "ValueError: Invalid TOSA graph", + "fold": "ValueError: Invalid TOSA graph", }, ) def test_nn_functional_FP(test_data): diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py index 3600c34c94c..080dddda92e 100644 --- a/backends/arm/test/ops/test_amax.py +++ b/backends/arm/test/ops/test_amax.py @@ -95,10 +95,7 @@ def test_amax_u55_INT_not_delegated(): pipeline.run() -fvp_xfails = {"rank_4_mult_batches": "MLETORCH-517 : Multiple batches not supported"} - - -@common.parametrize("test_data", Amax.test_data, fvp_xfails, strict=False) +@common.parametrize("test_data", Amax.test_data) @common.XfailIfNoCorstone320 def test_amax_u85_INT(test_data: Amax.input_t): data, dim, keep_dims = test_data() diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py index 3ae94fe3c6e..a24da9e1ba0 100644 --- a/backends/arm/test/ops/test_amin.py +++ b/backends/arm/test/ops/test_amin.py @@ -104,10 +104,7 @@ def test_amin_u55_INT_not_delegated(): pipeline.run() -fvp_xfails = {"rank_4_mult_batches": "MLETORCH-517 : Multiple batches not supported"} - - -@common.parametrize("test_data", Amin.test_data, fvp_xfails, strict=False) +@common.parametrize("test_data", Amin.test_data) @common.XfailIfNoCorstone320 def test_amin_u85_INT(test_data: Amin.input_t): data, dim, keep_dims = test_data() diff --git a/backends/arm/test/ops/test_bitwise.py b/backends/arm/test/ops/test_bitwise.py index 1c0f0e36a6a..46e84361573 100644 --- a/backends/arm/test/ops/test_bitwise.py +++ b/backends/arm/test/ops/test_bitwise.py @@ -4,8 +4,10 @@ # LICENSE file in the root directory of this source tree. 
+from copy import copy from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -16,7 +18,6 @@ VgfPipeline, ) - input_t2 = Tuple[torch.Tensor, torch.Tensor] # Input x, y @@ -56,6 +57,9 @@ class BitwiseBinary(torch.nn.Module): ), } + test_data_u85 = copy(test_data) + del test_data_u85["zeros"] + class BitwiseBinaryScalar(torch.nn.Module): test_data = { @@ -77,6 +81,9 @@ class BitwiseBinaryScalar(torch.nn.Module): ), } + test_data_u85 = copy(test_data) + del test_data_u85["zeros"] + class And(BitwiseBinary): aten_op = "torch.ops.aten.bitwise_and.Tensor" @@ -226,7 +233,7 @@ def test_bitwise_and_scalar_u55_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", AndScalar.test_data) +@common.parametrize("test_data", AndScalar.test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_and_scalar_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -244,7 +251,7 @@ def test_bitwise_and_scalar_u85_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", And().test_data) +@common.parametrize("test_data", And().test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_and_tensor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -427,7 +434,7 @@ def test_bitwise_xor_scalar_u55_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Xor().test_data) +@common.parametrize("test_data", Xor().test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_xor_tensor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -445,7 +452,7 @@ def test_bitwise_xor_tensor_u85_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", XorScalar.test_data) +@common.parametrize("test_data", XorScalar.test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_xor_scalar_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ 
-628,7 +635,7 @@ def test_bitwise_or_scalar_u55_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Or().test_data) +@common.parametrize("test_data", Or().test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_or_tensor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -646,7 +653,7 @@ def test_bitwise_or_tensor_u85_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", OrScalar.test_data) +@common.parametrize("test_data", OrScalar.test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_or_scalar_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -730,3 +737,30 @@ def test_bitwise_or_scalar_vgf_INT(test_data: input_t2): pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() + + +@pytest.mark.xfail( + reason="MLBEDSW-11029: Fatal Python floating point error in Vela for rank 4 bitwise ops with int32 dtype." +) +def test_bitwise_or_tensor_u85_INT_zeros(): + raise RuntimeError( + "Dummy test to xfail mark u85 zeros test case since running the actual test causes a fatal crash." + ) + + +@pytest.mark.xfail( + reason="MLBEDSW-11029: Fatal Python floating point error in Vela for rank 4 bitwise ops with int32 dtype." +) +def test_bitwise_and_tensor_u85_INT_zeros(): + raise RuntimeError( + "Dummy test to xfail mark u85 zeros test case since running the actual test causes a fatal crash." + ) + + +@pytest.mark.xfail( + reason="MLBEDSW-11029: Fatal Python floating point error in Vela for rank 4 bitwise ops with int32 dtype." +) +def test_bitwise_xor_tensor_u85_INT_zeros(): + raise RuntimeError( + "Dummy test to xfail mark u85 zeros test case since running the actual test causes a fatal crash." 
+ ) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index 826689622fb..55578aa15c6 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -105,15 +105,7 @@ def test_cat_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "cat_rand_two_tensors_dim_0": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", - "cat_rand_two_tensors_dim_0": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", - "cat_rand_two_tensors_dim_3": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", - "cat_rand_large": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", -} - - -@common.parametrize("test_data", Cat.test_parameters, x_fails) +@common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone300 def test_cat_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -126,7 +118,7 @@ def test_cat_u55_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", Cat.test_parameters, x_fails) +@common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone320 def test_cat_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 7a24848697e..b4f2879be48 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -106,6 +106,9 @@ def test_clone_u85_INT(test_data): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. 
See: https://jira.arm.com/browse/MLBEDSW-10477" +) def test_clone_vgf_FP(test_data): pipeline = VgfPipeline[input_t]( Clone(), test_data(), aten_op, exir_op, tosa_version="TOSA-1.0+FP" @@ -115,6 +118,9 @@ def test_clone_vgf_FP(test_data): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" +) def test_clone_vgf_INT(test_data): pipeline = VgfPipeline[input_t]( Clone(), diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 0d23d2a6c7e..0300f7c2049 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -388,15 +388,6 @@ def forward(self, x): for q in [True, False] } -fvp_xfails = { - f"{k},per_channel_quant={q}": reason - for k, reason in { - "2x2_3x2x40x40_nobias": "MLETORCH-520: Numerical issues on FVP.", - "5x5_3x2x128x128_st1": "MLETORCH-520: Numerical issues on FVP.", - }.items() - for q in [True, False] -} - input_t = Tuple[torch.Tensor] @@ -426,7 +417,7 @@ def test_convolution_2d_tosa_INT(test_data): pipeline.run() -@common.parametrize("test_data", test_data_INT, fvp_xfails) +@common.parametrize("test_data", test_data_INT) @common.XfailIfNoCorstone300 def test_convolution_2d_u55_INT(test_data): model, per_channel_quantization = test_data() @@ -441,7 +432,7 @@ def test_convolution_2d_u55_INT(test_data): pipeline.run() -@common.parametrize("test_data", test_data_INT, fvp_xfails) +@common.parametrize("test_data", test_data_INT) @common.XfailIfNoCorstone320 def test_convolution_u85_INT(test_data): model, per_channel_quantization = test_data() diff --git a/backends/arm/test/ops/test_cosh.py b/backends/arm/test/ops/test_cosh.py index 14b7def60cd..60920d03f94 100644 --- a/backends/arm/test/ops/test_cosh.py +++ b/backends/arm/test/ops/test_cosh.py @@ -73,7 +73,14 @@ def test_cosh_u55_INT(test_data: Tuple): 
@common.XfailIfNoCorstone320 -@common.parametrize("test_data", test_data_suite) +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ones_4D": "MLBEDSW-11046 - Incorrect output for TABLE followed by RESHAPE" + }, + strict=False, +) def test_cosh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Cosh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index bf6aad840ac..0f8b34d3d47 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -199,7 +199,7 @@ @common.parametrize("test_data", test_data_conv1d_FP | test_data_conv2d_FP) -def test_depthwise_convolution_2d_tosa_FP(test_data: torch.nn.Module): +def test_convolution_2d_tosa_FP_depthwise(test_data: torch.nn.Module): pipeline = TosaPipelineFP[input_t]( test_data(), test_data().get_inputs(), @@ -211,7 +211,7 @@ def test_depthwise_convolution_2d_tosa_FP(test_data: torch.nn.Module): @pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) @common.parametrize("test_data", test_data_conv1d_INT | test_data_conv2d_INT) -def test_depthwise_convolution_2d_tosa_INT(test_data): +def test_convolution_2d_tosa_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = TosaPipelineINT[input_t]( model, @@ -225,7 +225,7 @@ def test_depthwise_convolution_2d_tosa_INT(test_data): @common.parametrize("test_data", test_data_conv1d_FP | test_data_conv2d_FP) @common.SkipIfNoModelConverter -def test_depthwise_convolution_2d_vgf_FP(test_data: torch.nn.Module): +def test_convolution_2d_vgf_FP_depthwise(test_data: torch.nn.Module): model = test_data() pipeline = VgfPipeline[input_t]( model, @@ -239,7 +239,7 @@ def test_depthwise_convolution_2d_vgf_FP(test_data: torch.nn.Module): @common.parametrize("test_data", test_data_conv1d_INT | test_data_conv2d_INT) @common.SkipIfNoModelConverter -def 
test_depthwise_convolution_2d_vgf_INT(test_data): +def test_convolution_2d_vgf_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = VgfPipeline[input_t]( model, @@ -251,19 +251,9 @@ def test_depthwise_convolution_2d_vgf_INT(test_data): pipeline.run() -x_fails = { - f"{k},per_channel_quant={q}": reason - for k, reason in { - "3x3_2x8x198x198_gp8_st3": "MLETORCH-517: Operators fail with batches > 1", - "two_dw_conv2d": "MLETORCH-517: Operators fail with batches > 1", - }.items() - for q in [True, False] -} - - @common.XfailIfNoCorstone300 # TODO: MLETORCH-516 -@common.parametrize("test_data", test_data_conv2d_INT, x_fails) -def test_depthwise_convolution_2d_u55_INT(test_data): +@common.parametrize("test_data", test_data_conv2d_INT) +def test_convolution_2d_u55_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU55PipelineINT[input_t]( model, @@ -278,7 +268,7 @@ def test_depthwise_convolution_2d_u55_INT(test_data): @common.XfailIfNoCorstone300 # TODO: MLETORCH-516 @common.parametrize("test_data", test_data_conv1d_INT) -def test_depthwise_convolution_1d_u55_INT(test_data): +def test_convolution_1d_u55_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU55PipelineINT[input_t]( model, @@ -292,8 +282,8 @@ def test_depthwise_convolution_1d_u55_INT(test_data): @common.XfailIfNoCorstone320 # TODO: MLETORCH-516 -@common.parametrize("test_data", test_data_conv2d_INT, x_fails) -def test_depthwise_convolution_2d_u85_INT(test_data): +@common.parametrize("test_data", test_data_conv2d_INT) +def test_convolution_2d_u85_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU85PipelineINT[input_t]( model, @@ -307,8 +297,8 @@ def test_depthwise_convolution_2d_u85_INT(test_data): @common.XfailIfNoCorstone320 # TODO: MLETORCH-516 -@common.parametrize("test_data", test_data_conv1d_INT, x_fails) -def test_depthwise_convolution_1d_u85_INT(test_data): 
+@common.parametrize("test_data", test_data_conv1d_INT) +def test_convolution_1d_u85_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU85PipelineINT[input_t]( model, diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 026939758a0..5bacac1c962 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -101,17 +101,7 @@ def test_div_tensor_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "op_div_rank4_ones": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_negative_ones": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_ones_div_negative": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_large_rand": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_negative_large_rand": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_large_randn": "MLETORCH-521: Numerical issues on FVP likely due to mul op", -} - - -@common.parametrize("test_data", test_data_suite, xfails=x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_div_tensor_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -124,7 +114,7 @@ def test_div_tensor_u55_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, xfails=x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_div_tensor_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index 607d8650946..a0a7ccadeb4 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -70,14 +70,7 @@ def test_expand_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "rand_batch_2": "AssertionError: Output 0 does not match reference 
output.", - "rand_mix_neg": "AssertionError: Output 0 does not match reference output.", - "rand_small_neg": "AssertionError: Output 0 does not match reference output.", -} - - -@common.parametrize("test_data", Expand.test_parameters, x_fails) +@common.parametrize("test_data", Expand.test_parameters) @common.XfailIfNoCorstone300 def test_expand_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -90,7 +83,7 @@ def test_expand_u55_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", Expand.test_parameters, x_fails) +@common.parametrize("test_data", Expand.test_parameters) @common.XfailIfNoCorstone320 def test_expand_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index c66f6d164b9..94f33d28630 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -153,7 +153,6 @@ def test_ge_scalar_u55_INT(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={"ge_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, ) @common.XfailIfNoCorstone320 def test_ge_tensor_u85_INT(test_module): @@ -170,7 +169,6 @@ def test_ge_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={"ge_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, ) @common.XfailIfNoCorstone320 def test_ge_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 83c85e5f9fc..41229397eb5 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -154,9 +154,6 @@ def test_gt_scalar_u55_INT(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={ - "gt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_gt_tensor_u85_INT(test_module): @@ -173,9 +170,6 @@ def 
test_gt_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={ - "gt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_gt_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py index 6cb185ecb92..31422302a2d 100644 --- a/backends/arm/test/ops/test_le.py +++ b/backends/arm/test/ops/test_le.py @@ -155,9 +155,6 @@ def test_le_scalar_u55_INT_not_delegated(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={ - "le_tensor_rank4_randn": "4D fails because boolean Tensors can't be subtracted" - }, ) @common.XfailIfNoCorstone320 def test_le_tensor_u85_INT(test_module): @@ -175,9 +172,6 @@ def test_le_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={ - "le_scalar_rank4_randn": "4D fails because boolean Tensors can't be subtracted" - }, ) @common.XfailIfNoCorstone320 def test_le_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 57ce490dae8..f5cb2e952e5 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -183,24 +183,9 @@ def test_linear_u55_INT(test_data: torch.Tensor): ).run() -x_fail = { - f"{k},per_channel_quant={q}": reason - for k, reason in { - "model_linear_rank4_zeros": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_ones": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_negative_ones": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_rand": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_negative_large_rand": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_large_randn": "AssertionError: Output 0 does not match reference output.", - }.items() - for q in 
[True, False] -} - - @common.parametrize( "test_data", test_data_rank1_INT | test_data_rank4_INT, - x_fail, ) @common.XfailIfNoCorstone320 def test_linear_u85_INT(test_data: torch.Tensor): diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index 2b160ce7b50..bb7c5773342 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -86,6 +86,9 @@ def forward(self, tensor: torch.Tensor): ################# +xfails = {"rand_rank4": "MLBEDSW-11031: Output diff on u85 bool transpose."} + + @common.parametrize("test_data", And().test_data) def test_logical_and_tosa_FP(test_data: input_t2): pipeline = TosaPipelineFP[input_t2]( @@ -129,7 +132,7 @@ def test_logical_and_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", And().test_data) +@common.parametrize("test_data", And().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def test_logical_and_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -223,7 +226,7 @@ def test_logical_xor_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Xor().test_data) +@common.parametrize("test_data", Xor().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def test_logical_xor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -317,7 +320,7 @@ def test_logical_or_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Or().test_data) +@common.parametrize("test_data", Or().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def test_logical_or_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -411,7 +414,7 @@ def test_logical_not_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Not().test_data) +@common.parametrize("test_data", Not().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def 
test_logical_not_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index b1b934fbcc8..791069aa4b0 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -68,7 +68,7 @@ def test_log_softmax_tosa_INT(test_data): "test_data", LogSoftmax.test_data, xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." }, ) @common.XfailIfNoCorstone300() @@ -85,13 +85,7 @@ def test_log_softmax_u55_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - LogSoftmax.test_data, - xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" - }, -) +@common.parametrize("test_data", LogSoftmax.test_data) @common.XfailIfNoCorstone320 def test_log_softmax_u85_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index 86d903e3f88..98d0298b195 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -154,9 +154,6 @@ def test_lt_scalar_u55_INT_not_delegated(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={ - "lt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_lt_tensor_u85_INT(test_module): @@ -173,9 +170,6 @@ def test_lt_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={ - "lt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_lt_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 6b75c2b7d0a..7db56311837 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ 
b/backends/arm/test/ops/test_max_pool.py @@ -65,12 +65,10 @@ torch.rand(1, 16, 54, 54), [3, (1, 3), 1], ), -} - -test_data_suite_mult_batches = { "randn": lambda: (torch.randn(5, 16, 50, 32), [4, 2, 0]), } + test_data_suite_dilation = [ # Simple dilation=2 on 8x8 input, kernel=3, stride=1, no padding ("dilation2", torch.rand(1, 1, 8, 8), [3, 1, 0, 2]), @@ -161,61 +159,6 @@ def test_max_pool2d_u85_INT(test_data: torch.Tensor): ).run() -@common.parametrize("test_data", test_data_suite_mult_batches) -def test_max_pool2d_tosa_FP_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = TosaPipelineFP[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - ) - pipeline.run() - - -@common.parametrize("test_data", test_data_suite_mult_batches) -def test_max_pool2d_tosa_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = TosaPipelineINT[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - ) - pipeline.run() - - -x_fail = {"randn": "MLETORCH-986: Numerical issues with mutli batches."} - - -@common.parametrize("test_data", test_data_suite_mult_batches, x_fail) -@common.XfailIfNoCorstone300 -def test_max_pool2d_u55_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - EthosU55PipelineINT[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_ops=[], - run_on_fvp=True, - use_to_edge_transform_and_lower=True, - ).run() - - -@common.parametrize("test_data", test_data_suite_mult_batches, x_fail) -@common.XfailIfNoCorstone320 -def test_max_pool2d_u85_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - EthosU85PipelineINT[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - run_on_fvp=True, - use_to_edge_transform_and_lower=True, - ).run() - - reject_data_suite = { "reject_1": lambda: (MaxPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)), "reject_2": 
lambda: (MaxPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300)), @@ -306,34 +249,6 @@ def test_max_pool2d_vgf_INT(test_data: torch.Tensor): pipeline.run() -@common.parametrize("test_data", test_data_suite_mult_batches) -@common.SkipIfNoModelConverter -def test_max_pool2d_vgf_FP_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = VgfPipeline[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - tosa_version="TOSA-1.0+FP", - ) - pipeline.run() - - -@common.parametrize("test_data", test_data_suite_mult_batches) -@common.SkipIfNoModelConverter -def test_max_pool2d_vgf_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = VgfPipeline[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - tosa_version="TOSA-1.0+INT", - ) - pipeline.run() - - @common.parametrize("test_data", dilation_test_data) @common.SkipIfNoModelConverter def test_max_pool2d_vgf_FP_dilation(test_data: torch.Tensor): diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 1483b5d82b6..061e8da14f1 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -280,6 +280,7 @@ def test_mean_dim_tosa_INT(test_data): (test_data,), [], # Might be sum, avgpool, or both symmetric_io_quantization=True, + custom_path="MEANDIM", ) pipeline.run() diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index b0b7f5f4b7d..d8f9e947ce3 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -200,15 +200,7 @@ def test_mul_tensor_u85_INT(test_data: torch.Tensor): pipeline.run() -@common.parametrize( - "test_data", - test_data_suite_int32, - xfails={ - # TODO: MLETORCH-1132 Investigate why tests with inputs that require broadcasting fail on u55/u85 - "op_mul_rank4_randn_mutltiple_broadcasts_int32": "RuntimeError: mean(): could not infer output dtype. 
Input dtype must be either a floating point or complex dtype. Got: Int", - "op_mul_rank4_randn_broadcast_int32": "RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. Got: Int", - }, -) +@common.parametrize("test_data", test_data_suite_int32) @common.XfailIfNoCorstone300 def test_mul_tensor_u55_INT_int32(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( @@ -222,15 +214,7 @@ def test_mul_tensor_u55_INT_int32(test_data: torch.Tensor): pipeline.run() -@common.parametrize( - "test_data", - test_data_suite_int32, - xfails={ - # TODO: MLETORCH-1132 Investigate why tests with inputs that require broadcasting fail on u55/u85 - "op_mul_rank4_randn_mutltiple_broadcasts_int32": "RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. Got: Int", - "op_mul_rank4_randn_broadcast_int32": "RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. 
Got: Int", - }, -) +@common.parametrize("test_data", test_data_suite_int32) @common.XfailIfNoCorstone320 def test_mul_tensor_u85_INT_int32(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 57f7f9603a1..eb482bcee54 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -72,13 +72,11 @@ def test_permute_tosa_INT(test_data: torch.Tensor): pipeline.run() -x_fails = { - "rank_4_2": "AssertionError: Output 0 does not match reference output.", - "rank_4_3": "AssertionError: Output 0 does not match reference output.", -} - - -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize( + "test_data", + test_data_suite, + xfails={"rank_4_3": "MLETORCH-955 : Permutation numerical diff for u55"}, +) @common.XfailIfNoCorstone300 def test_permute_u55_INT(test_data): test_data, dims = test_data() @@ -92,8 +90,7 @@ def test_permute_u55_INT(test_data): pipeline.run() -# Fails since on FVP since N > 1 is not supported. MLETORCH-517 -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_permute_u85_INT(test_data: torch.Tensor): test_data, dims = test_data() diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index dcf5a4a181b..4c3887f1e18 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -102,12 +102,7 @@ def test_select_int_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "select4d_0_dim_2_index": "AssertionError: Output 0 does not match reference output." 
-} - - -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_select_int_u55_INT_copy(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -121,7 +116,7 @@ def test_select_int_u55_INT_copy(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_select_int_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -148,7 +143,7 @@ def test_select_int_u55_INT_not_delegated(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_select_int_u85_INT_copy(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( @@ -162,7 +157,7 @@ def test_select_int_u85_INT_copy(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_select_int_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index 3d70881a3f0..4252d96bac9 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -103,9 +103,6 @@ def test_sigmoid_tosa_INT(test_data): @common.parametrize( "test_data", test_data_suite, - xfails={ - "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787" - }, strict=False, ) def test_sigmoid_tosa_INT_add_sigmoid(test_data): @@ -121,14 +118,6 @@ def test_sigmoid_tosa_INT_add_sigmoid(test_data): pipeline.run() -xfails = { - "ones": "AssertionError: Output 0 does not match reference output. MLETORCH-787", - "rand": "AssertionError: Output 0 does not match reference output. 
MLETORCH-787", - "rand_4d": "AssertionError: Output 0 does not match reference output. MLETORCH-787", - "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787", -} - - @common.parametrize( "test_data", test_data_suite, diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index 4bbd4d83285..dc258f20ec4 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -64,8 +64,8 @@ def test_softmax_tosa_INT(test_data): @common.parametrize( "test_data", Softmax.test_data, - xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + { + "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." }, ) @common.XfailIfNoCorstone300 @@ -77,13 +77,7 @@ def test_softmax_u55_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - Softmax.test_data, - xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" - }, -) +@common.parametrize("test_data", Softmax.test_data) @common.XfailIfNoCorstone320 def test_softmax_u85_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py index 00ec1f48af8..15e2dd45322 100644 --- a/backends/arm/test/ops/test_sqrt.py +++ b/backends/arm/test/ops/test_sqrt.py @@ -40,11 +40,6 @@ def forward(self, x): } -fvp_xfails = { - "sqrt_tensor_rank4_multibatch": "MLETORCH-517 : Multiple batches not supported", -} - - @common.parametrize("test_data", Sqrt.test_data) def test_sqrt_tosa_FP(test_data: Sqrt.input_t): pipeline = TosaPipelineFP[Sqrt.input_t]( @@ -67,7 +62,7 @@ def test_sqrt_tosa_INT(test_data: Sqrt.input_t): pipeline.run() -@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.parametrize("test_data", Sqrt.test_data) @common.XfailIfNoCorstone300 def test_sqrt_u55_INT(test_data: Sqrt.input_t): pipeline = EthosU55PipelineINT[Sqrt.input_t]( @@ -80,7 
+75,7 @@ def test_sqrt_u55_INT(test_data: Sqrt.input_t): pipeline.run() -@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.parametrize("test_data", Sqrt.test_data) @common.XfailIfNoCorstone320 def test_sqrt_u85_INT(test_data: Sqrt.input_t): pipeline = EthosU85PipelineINT[Sqrt.input_t]( diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index e89fee04b62..ab6612393b8 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -31,8 +31,6 @@ "zeros": lambda: (torch.zeros(10),), } -fvp_sub_xfails = {"rand_4D_2x3x4x5": "MLETORCH-517 : Multiple batches not supported"} - # Two-input subtraction (x - y) sub2_test_data = { "rand_2D_4x4": lambda: (torch.rand(4, 4), torch.rand(4, 4)), @@ -46,7 +44,6 @@ "rand_3d_rand_Scalar": lambda: (torch.rand(1, 6, 2), torch.rand(1)), "rand_3d_Scalar": lambda: (torch.rand(1, 6, 2), 1), } -fvp_sub2_xfails = {"rand_4D_2x2x4x4": "MLETORCH-517 : Multiple batches not supported"} class Sub(torch.nn.Module): @@ -111,7 +108,7 @@ def test_sub_tensor_tosa_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() -@common.parametrize("test_data", sub_test_data, fvp_sub_xfails) +@common.parametrize("test_data", sub_test_data) @common.XfailIfNoCorstone300 def test_sub_tensor_u55_INT(test_data): """Test Subtraction on Ethos-U55 (FVP Mode)""" @@ -125,7 +122,7 @@ def test_sub_tensor_u55_INT(test_data): pipeline.run() -@common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) +@common.parametrize("test_data", sub2_test_data) @common.XfailIfNoCorstone300 def test_sub_tensor_u55_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction on Ethos-U55 (FVP Mode)""" @@ -139,7 +136,7 @@ def test_sub_tensor_u55_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() -@common.parametrize("test_data", sub_test_data, fvp_sub_xfails) +@common.parametrize("test_data", sub_test_data) @common.XfailIfNoCorstone320 def 
test_sub_tensor_u85_INT_2(test_data): """Test Subtraction on Ethos-U85 (FVP Mode)""" @@ -153,7 +150,7 @@ def test_sub_tensor_u85_INT_2(test_data): pipeline.run() -@common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) +@common.parametrize("test_data", sub2_test_data) @common.XfailIfNoCorstone320 def test_sub_tensor_u85_INT(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction on Ethos-U85 (FVP Mode)""" diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 71cb2ed73bb..0aa6f9a0245 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -82,22 +82,7 @@ def test_view_tosa_INT(test_data: Tuple): pipeline.run() -xfails = { - "rand_4d_neg": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4d_small": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4d": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2d": "MLETORCH-517: Multiple batches not supported", - "rand_4d_3d": "MLETORCH-517: Multiple batches not supported", - "rand_4d_1": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2_4_big": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4_3": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4_2": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2_4_same": "MLETORCH-517: Multiple batches not supported", -} - - -@common.parametrize("test_data", View.needs_transpose_tests, xfails=xfails) +@common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone300 def test_view_u55_INT(test_data: Tuple): test_tensor, new_shape = test_data() @@ -136,7 +121,7 @@ def test_view_vgf_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", View.rank_product_too_large, xfails=xfails) +@common.parametrize("test_data", View.rank_product_too_large) @common.XfailIfNoCorstone300 def 
test_view_u55_INT_not_delegated(test_data: Tuple): test_tensor, new_shape = test_data() @@ -151,7 +136,7 @@ def test_view_u55_INT_not_delegated(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", View.needs_transpose_tests, xfails=xfails) +@common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone320 def test_view_u85_INT(test_data: Tuple): test_tensor, new_shape = test_data() diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 7ede72d9c4d..ae6c414e884 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -172,7 +172,14 @@ def test_quantized_rescale_tosa_bi(test_data: tuple[torch.Tensor, torch.Tensor]) pipeline.run() -@common.parametrize("test_data", RescaleNetwork.test_data) +u55_xfails = { + "ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", + "randn_ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", + "randn_large": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", +} + + +@common.parametrize("test_data", RescaleNetwork.test_data, xfails=u55_xfails) @common.XfailIfNoCorstone300 def test_quantized_rescale_u55(test_data: tuple[torch.Tensor, torch.Tensor]): """Tests a model with many ops that requires rescales. As more ops are quantized to int32 and diff --git a/backends/arm/test/passes/test_to_tosa_memory_format.py b/backends/arm/test/passes/test_to_tosa_memory_format.py new file mode 100644 index 00000000000..1e9b8ffc63d --- /dev/null +++ b/backends/arm/test/passes/test_to_tosa_memory_format.py @@ -0,0 +1,192 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes import ToTosaMemoryFormatPass + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + PassPipeline, + TosaPipelineINT, +) +from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass + +input_t = Tuple[torch.Tensor] # Input x + + +class NoNHWC(torch.nn.Module): + """ + Test-module with no ops requiring NHWC memory format. + """ + + ops_after_pass = {"executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 2} + ops_not_after_pass = [] + + def forward(self, x): + x = x + x + return x + + def get_inputs(self): + return (torch.rand(1, 2, 2, 2),) + + +class ParallelClusters(torch.nn.Module): + """ + Test-module with multiple parallel clusters of nodes requiring different memory formats. + """ + + ops_after_pass = {"executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 2} + ops_not_after_pass = [] + + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=2, + out_channels=2, + kernel_size=1, + bias=True, + ) + self.maxpool = torch.nn.MaxPool2d(1, 1) + self.avgpool = torch.nn.AvgPool2d(1, 1) + + def forward(self, x): + x1 = self.conv(x) + x2 = self.maxpool(x) + x3 = self.avgpool(x) + x4 = x * x + return x1 + x2 + x3 + x4 + + def get_inputs(self): + return (torch.rand(1, 2, 2, 2),) + + +class SerialClusters(torch.nn.Module): + """ + Test-module with multiple serial clusters of nodes requiring different memory formats.
+ """ + + ops_before_pass = {} + ops_after_pass = {"executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 4} + ops_not_after_pass = [] + + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=2, + out_channels=2, + kernel_size=1, + bias=True, + ) + self.fc = torch.nn.Linear( + in_features=2, + out_features=2, + bias=True, + ) + + def forward(self, x): + x = self.conv(x) + x = x * x + x = self.conv(x) + x = x.view((2, 1, 2, 4)) + x = x * 2 + x = x.view((2, 2, 2, 2)) + x = self.conv(x) + return x + + def get_inputs(self): + return (torch.rand(2, 2, 2, 2),) + + +class Reshapes(torch.nn.Module): + """ + Test-module with different configurations of views requiring different memory formats. + """ + + ops_before_pass = {} + ops_after_pass = { + "executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 16 + } + ops_not_after_pass = [] + + def __init__(self): + super().__init__() + self.maxpool = torch.nn.MaxPool2d(1, 1) # Use maxpool to force NHWC format + + def forward(self, x): + + x = self.maxpool(x) + x = x.view((2, 2, 4, 16, 1)) # N-C-HW-invariant intact, no transposes needed + x = x * 2 # Add op to avoid views merging + x = x.view((4, 4, 4, 4)) + x = x / 2 # Add op to avoid views merging + x = self.maxpool(x) + + x = x.view((256)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((16, 16)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((16, 4, 4)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((2, 4, 4, 8)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((8, 1, 2, 4, 4)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = self.maxpool(x) + + return x + + def get_inputs(self): + return (torch.rand(4, 4, 4, 4),) + + +modules = { + "no_nhwc": 
NoNHWC(), + "parallel_clusters": ParallelClusters(), + "serial_clusters": SerialClusters(), + "reshapes": Reshapes(), +} + + +@common.parametrize("module", modules) +def test_to_tosa_memory_format_tosa_INT(module): + # We cannot check op counts after a specific pass with the full pipeline + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + ops_after_pass=module.ops_after_pass, + ops_not_after_pass=module.ops_not_after_pass, + pass_list=[RemoveGetItemPass], + passes_with_exported_program=[ToTosaMemoryFormatPass], + ) + pipeline.pop_stage( + "run_method_and_compare_outputs" + ) # Eager execution is not possible after introducing tosa.TRANSPOSE + pipeline.run() + + +@common.parametrize("module", modules) +def test_to_tosa_memory_format_tosa_INT_functional(module): + # Also run the actual pass pipeline to ensure functional correctness. + pipeline = TosaPipelineINT[input_t](module, module.get_inputs(), []) + pipeline.run() diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 4335e96c730..523a3f30a54 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -667,7 +667,6 @@ def run_tosa_graph( ) -> list[torch.Tensor]: """Runs the TOSA reference model with inputs and returns the result.""" inputs_np = [input.numpy() for input in inputs] - transpose_data_format(inputs_np, to="NHWC") if isinstance(tosa_version, Tosa_1_00): import tosa_reference_model as reference_model @@ -689,24 +688,9 @@ def run_tosa_graph( status == reference_model.GraphStatus.TOSA_VALID ), "Non-valid TOSA given to reference model." 
- transpose_data_format(outputs_np, to="NCHW") return [torch.from_numpy(output) for output in outputs_np] -def transpose_data_format(data: list[np.ndarray], to: Literal["NHWC", "NCHW"]): - for i in range(len(data)): - if hasattr(data[i], "shape") and data[i].ndim in (4, 5): - match to: - case "NCHW": - dim_order = (0, 3, 1, 2) if data[i].ndim == 4 else (0, 1, 4, 2, 3) - case "NHWC": - dim_order = (0, 2, 3, 1) if data[i].ndim == 4 else (0, 1, 3, 4, 2) - case _: - raise NotImplementedError(f"Cant transpose to dim order {to}") - # Copy is needed to force actual data conversion, not setting stride. - data[i] = np.transpose(data[i], dim_order).copy() - - def get_target_board(compile_spec: list[CompileSpec]) -> str | None: if is_vgf(compile_spec): return "vkml_emulation_layer"